def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt'
    )

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
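A minimal usage sketch (an addition, not part of the original listing; it assumes the CVAT resources and the ./tmp/ cache paths hard-coded above exist):

# Hypothetical usage: load or rebuild the indexed CVAT data and embedding matrix.
data, W = build_keras_input()
idx_data, valence, arousal = data
print('texts: %s, embedding matrix shape: %s' % (len(idx_data), str(W.shape)))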
Example #2
def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings('zh',
                                '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
Example #3
def visual_pos_neg_vecs(amended_pos_path='./tmp/amended_pos.p',
                        amended_neg_path='./tmp/amended_neg.p'):
    amended_pos = load_pickle(amended_pos_path)
    amended_neg = load_pickle(amended_neg_path)
    nb_pos, nb_neg = len(amended_pos), len(amended_neg)
    print('There are %s positive words, and %s negative words.' %
          (nb_pos, nb_neg))
    num = 500
    vecs = [v for v in list(amended_pos.values())[:num]
            ] + [v for v in list(amended_neg.values())[:num]]
    vecs = np.array(vecs)
    print('The shape of vecs is: %s rows * %s columns.' % vecs.shape)
    reduced_vecs = t_sne(vecs)
    print('The shape of reduced vecs is: %s rows * %s columns.' % reduced_vecs.shape)

    for i, vec in enumerate(vecs):
        if i < num:  # pos
            color = 'r'
        else:  # neg
            color = 'b'
        plt.plot(reduced_vecs[i, 0],
                 reduced_vecs[i, 1],
                 marker='o',
                 color=color,
                 markersize=8)
    plt.show()
Example #4
def build_data():
    positive_data = load_pickle('./tmp/amended_pos.p')
    negative_data = load_pickle('./tmp/amended_neg.p')
    X, Y = [], []
    for pos in positive_data.keys():
        X.append(positive_data[pos])
        Y.append(1)
    for neg in negative_data.keys():
        X.append(negative_data[neg])
        Y.append(0)
    return np.array(X), np.array(Y)
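A short usage sketch (an addition): X holds the amended word vectors, Y the 1 (positive) / 0 (negative) labels; it assumes the pickles under ./tmp/ exist.

X, Y = build_data()
print('X: %s, Y: %s' % (str(X.shape), str(Y.shape)))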
Example #5
def build_data():
    positive_data = load_pickle('./tmp/amended_pos.p')
    negative_data = load_pickle('./tmp/amended_neg.p')
    X, Y = [], []
    for pos in positive_data.keys():
        X.append(positive_data[pos])
        Y.append(1)
    for neg in negative_data.keys():
        X.append(negative_data[neg])
        Y.append(0)
    return np.array(X), np.array(Y)
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type, key: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and new == False:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)

        test_data = load_pickle(test_filename)

        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle

    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)

    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)

    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))

    return (data, W, test_data)
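A hedged sketch of the expected call (an addition): texts and scores are dicts keyed by 'train' / 'dev' / 'devtest', and test is a 3-tuple that Example #45 later unpacks as (ids, topics, texts). The placeholder names below (train_texts, test_ids, ...) are illustrative only; the exact text format is whatever get_vocab and make_idx_data expect.

texts = {'train': train_texts, 'dev': dev_texts, 'devtest': devtest_texts}    # placeholder per-split texts
scores = {'train': train_y, 'dev': dev_y, 'devtest': devtest_y}               # placeholder per-split scores
test = (test_ids, test_topics, test_texts)                                    # (ids, topics, texts), see Example #45
data, W, test_data = build_keras_input(texts, scores, test, new=True)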
Example #7
def cnn_Chinese(text=None):
    ########################### file_path ##############################
    embedding_matrix = './data/tmp/embedding_matrix_CVAT.p'
    word_idx_map = './data/tmp/word_idx_map_CVAT.p'
    cnn_model_weights_Valence = './data/tmp/CVAT_cnn_model_weights_Valence.hdf5'
    cnn_model_weights_Arousal = './data/tmp/CVAT_cnn_model_weights_Arousal.hdf5'
    ####################################################################
    request_text = text
    W = load_pickle(embedding_matrix)
    # print(len(W[1]))
    if request_text is None:
        request_text = '中文斷詞前言自然語言處理的其中一個重要環節就是中文斷詞的'
    # request_text = clean_str(request_text)
    # print(request_text)
    request_text = list(jieba.cut(request_text))
    word_idx_map = load_pickle(word_idx_map)

    idx_request_text = get_idx_from_sent(request_text, word_idx_map)
    print(idx_request_text)  # type: list
    max_len = len(idx_request_text)
    idx_request_text = np.array(idx_request_text).reshape((1, max_len))
    print(idx_request_text.shape)

    def cnn_model():
        N_fm = 400  # number of filters
        kernel_size = 8
        conv_input_height, conv_input_width = max_len, len(W[1])

        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W], W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm,
                                nb_row=kernel_size,
                                nb_col=conv_input_width,
                                border_mode='valid',
                                W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1), ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
        model.compile(loss='mse', optimizer='adagrad')
        return model

    model = cnn_model()
    model.load_weights(cnn_model_weights_Valence)
    valence = model.predict(idx_request_text)

    model.load_weights(cnn_model_weights_Arousal)
    arousal = model.predict(idx_request_text)

    return [valence[0], arousal[0]]
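A minimal usage sketch (an addition, assuming the pickled CVAT weights under ./data/tmp/ exist); the sample sentence is just an illustration.

valence, arousal = cnn_Chinese('這部電影讓人非常開心')   # sample Traditional Chinese input
print('valence: %s, arousal: %s' % (valence, arousal))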
Example #8
def build_amended_vectors(arg='word2vec'):
    prefix = None if arg == 'word2vec' else 'GloVe_'
    pos_vectors = load_pickle('./tmp/'+prefix+'common_positive_words.p')
    neg_vectors = load_pickle('./tmp/'+prefix+'common_negative_words.p')
    size = len(pos_vectors[list(pos_vectors.keys())[0]])
    print('The dimension of word vectors: %s.' % size)
    for k in pos_vectors:
        pos_vectors[k]=np.array(pos_vectors[k]).reshape((1, size))
    for k in neg_vectors:
        neg_vectors[k]=np.array(neg_vectors[k]).reshape((1, size))
    amended_pos, amended_neg = amend(pos_vectors, neg_vectors)
    dump_picle(amended_pos, './tmp/amended_'+prefix+'pos.p')
    dump_picle(amended_neg, './tmp/amended_'+prefix+'neg.p')
Example #9
def cnn(text=None):
    request_text = text
    # Test
    [idx_data, ratings] = load_pickle('./data/corpus/vader/vader_processed_data_tweets.p')
    # print(idx_data[2])
    # print(ratings[2])

    W = load_pickle('./data/corpus/vader/embedding_matrix_tweets.p')
    # print(len(W[1]))
    if request_text is None:
        request_text = 'why you are not happy'
    request_text = clean_str(request_text)
    # print(request_text)
    word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_tweets.p')

    idx_request_text = get_idx_from_sent(request_text, word_idx_map)
    # print(idx_request_text)  # type: list
    max_len = len(idx_request_text)
    idx_request_text = np.array(idx_request_text).reshape((1,max_len))
    # print(idx_request_text.shape)

    def cnn_model():
        N_fm = 100  # number of filters
        kernel_size = 5
        conv_input_height, conv_input_width = max_len, len(W[1])

        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W], W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm,
                                nb_row=kernel_size,
                                nb_col=conv_input_width,
                                border_mode='valid',
                                W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1), ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
        sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='mse', optimizer='adagrad')
        return model



    model = cnn_model()
    model.load_weights('./data/corpus/vader/cnn_model_weights.hdf5')
    predict_value = model.predict(idx_request_text)

    return [predict_value[0], 5.0]
Example #10
def result_analysis(filename):
    (param_grid, param_fitness) = load_pickle(filename)
    grid = ParameterGrid(param_grid)
    N=10         # top-n
    top_n_ind = np.argsort(param_fitness)[::-1][:N]         # top-n max value index

    for i in top_n_ind:
        print('Parameter setting: %s, acc: %s' % (str(list(grid)[i]), param_fitness[i]))
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #13
def build_amended_anew_vectors(words):
    filename = "./tmp/amended_anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    amended_pos = load_pickle("./tmp/amended_pos.p")
    amended_neg = load_pickle("./tmp/amended_neg.p")
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
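A usage sketch that unpacks the cached SST data returned above (an illustrative addition; names shortened for readability).

data, W = build_keras_input_amended()
(x_train_idx, y_train_valence, y_train_labels,
 x_test_idx, y_test_valence, y_test_labels,
 x_valid_idx, y_valid_valence, y_valid_labels,
 x_train_pol, y_train_pol,
 x_test_pol, y_test_pol,
 x_valid_pol, y_valid_pol) = data
print('embedding matrix shape: %s' % str(W.shape))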
Example #15
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
Example #16
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news',
                            '/home/hs/Data/Word_Embeddings/google_news.bin')
    amended_pos = load_pickle('./tmp/amended_pos.p')
    amended_neg = load_pickle('./tmp/amended_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
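A usage sketch (an addition): it assumes load_extend_anew, as used elsewhere in this listing, returns the ANEW word list as its first value.

words, _, _ = load_extend_anew()          # ANEW word list
vecs = build_amended_anew_vectors(words)  # one (amended) vector per word, cached to ./tmp/
print(vecs.shape)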
Example #17
def visual_pos_neg_vecs(amended_pos_path='./tmp/amended_pos.p', amended_neg_path='./tmp/amended_neg.p'):
    amended_pos = load_pickle(amended_pos_path)
    amended_neg = load_pickle(amended_neg_path)
    nb_pos, nb_neg = len(amended_pos), len(amended_neg)
    print('There are %s positive words, and %s negative words.' % (nb_pos, nb_neg))
    num = 500
    vecs= [v for v in list(amended_pos.values())[:num]] + [v for v in list(amended_neg.values())[:num]]
    vecs = np.array(vecs)
    print('The shape of vecs is: %s rows * %s columns.' % vecs.shape)
    reduced_vecs = t_sne(vecs)
    print('The shape of reduced vecs is: %s rows * %s columns.' % reduced_vecs.shape)

    for i, vec in enumerate(vecs):
        if i < num:     # pos
            color = 'r'
        else:           # neg
            color = 'b'
        plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
    plt.show()
Example #18
def build_ori_anew_vectors(words):
    filename = "./tmp/anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #19
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #21
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #22
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news',
                            '/home/hs/Data/Word_Embeddings/google_news.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #23
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        'google_news',
        'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #24
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    return


def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    return


def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    return

if __name__ == "__main__":
    train_data = load_pickle('./data/transformed_data/transformed_train.p')
    test = load_pickle('./data/transformed_data/transformed_test.p')
    _, train_labels = load_train_data()
    mNB(train_data, train_labels, test)
Example #26
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
# print(corpus[:2])
# vocab = get_vocab(corpus)
# dump_picle(vocab, get_file_path('CVAT_Vocab'))
# print('OK')
vocab = load_pickle(get_file_path('CVAT_Vocab'))
# for i in vocab:
#     print(i)
# print(len(vocab))

# W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')

word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
Example #27
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ##########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
    exit()
    # make word index map end
Example #28
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ##########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                             vocab,
                                             k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
Example #29
          (precision_binary, recall_binary, fbeta_score_binary))
    log_performance(accuracy, f1, precision_binary, recall_binary, len(true))
    if figure == False:
        return
    # plot the results
    n_groups = 5
    values = (accuracy, f1, precision_binary, recall_binary,
              fbeta_score_binary)
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    rects1 = plt.bar(index + bar_width / 2,
                     values,
                     bar_width,
                     alpha=0.6,
                     color='b')
    plt.xlabel('Result')
    plt.ylabel('Scores')
    plt.title('Experiment analysis')
    plt.xticks(index + bar_width,
               ('Accuracy', 'F', 'Precision', 'Recall', 'F'))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    predict = load_pickle('./data/predict_labels/predict_labels.p')
    _, true_labels = load_test_data()
    analysis_result(predict, true_labels)
__author__ = 'NLP-PC'
from load_data import load_pickle
import nltk
from load_data import load_extend_anew

words, _, _ = load_extend_anew()

feature_names = load_pickle('./data/features/feature_names.p')
print(feature_names)
english_stemmer = nltk.stem.SnowballStemmer('english')
stemmed_dict = [english_stemmer.stem(w) for w in words]
print(len(stemmed_dict))
overlapping_words = (set(feature_names) & set(stemmed_dict))
print(len(overlapping_words))
print(english_stemmer.stem(''))
features = load_pickle('./data/transformed_data/transformed_train.p')
print(features[1, 249])
print(type(features))

d = 'We are very nice goes I am nicely'
sent = list(d.split())
print(sent)
stemmed_sent = [english_stemmer.stem(w) for w in sent]
print(stemmed_sent)
def get_randomized_speed_profiles(experiment, a, sessions, filterpath):
    avgSpeeds_allsess = []
    trialTypes_allsess = []

    # Select Session
    for s in sessions:

        # Load Valid Trial Filter (manually sorted)
        validFilename = filterpath + r'\valid_a' + str(a) + '_s' + str(
            s) + '.pickle'
        valid_trials = load_data.load_pickle(validFilename)
        numTrials = np.size(valid_trials)

        # There is some session misalignment for the last 4 animals: their session 0 corresponds to session 1 for the others
        if a >= 10:
            s = s - 1

        # Set Trial Labels
        labelfilter1 = {'state': 'stable'}
        labelfilter2 = {'state': 'unstable'}
        labelFilters = [labelfilter1, labelfilter2]

        # Look at all Trajectories
        trajectories = experiment[a][s].trajectories
        times = experiment[a][s].time
        slices = experiment[a][s].slices
        labels = experiment[a][s].labels
        steps = experiment[a][s].steps
        speeds = experiment[a][s].speeds

        print(str.format('a:s {0}:{1} {2} {3}', a, s, numTrials, len(slices)))

        # Set Valid Trials (No exploration or tracking errors)
        crossings = valid_trials

        # Set Binning and Range
        avgSpeeds = np.zeros((numTrials, numBins))
        trialTypes = np.zeros((numTrials, 1))
        for t in range(0, numTrials):

            #label_indices = np.array(pt.get_labeled_indices(labels,labelFilters[l]))
            c = crossings[t]

            # Load X Trajectories and flip all of 'Left'
            trialX = trajectories[slices[c], 0]
            if utils.is_dict_subset({'direction': 'left'}, labels[c]):
                # Align on 2 important rails (the center of rail 3 is 550
                # and the center of rail 4 is 737); the first encounter
                # is at 550 going "right", and when flipped, at (1280 - 737 = 543)
                # going "left". Therefore, to correct for the shift, subtract 1273
                # and align the left and right trials.
                trialX = np.abs(trialX - 1273)

            # Load Y Trajectories
            trialY = trajectories[slices[c], 1]

            # Load and Parse Times
            trialTstrings = times[slices[c]]
            trialT = np.array([
                dateutil.parser.parse(timeString)
                for timeString in trialTstrings
            ])

            # Measure Progression Speed
            diffX = np.diff(trialX)
            diffT = time_diff(trialT) / 1000000  # Time interval in seconds
            speedX = np.concatenate((np.zeros(1), diffX / diffT))

            # Find enter/exit and crop trials
            indR = np.where(trialX > 1200)
            indL = np.where(trialX < 150)
            if (np.size(indR) > 0) and (np.size(indL) > 0):
                exitInd = indR[0][0] + 1
                enterInd = indL[0][-1]

            trialX = trialX[enterInd:exitInd]
            trialY = trialY[enterInd:exitInd]
            speedX = speedX[enterInd:exitInd]

            # Bin (progression - X) Speed Profiles (from position 200 to 1200)
            for b in range(0, numBins):
                bins = np.where((trialX >= (200 + (b * binSize)))
                                & (trialX < (200 + (b * binSize) + binSize)))
                if np.size(bins) > 0:
                    avgSpeeds[t, b] = np.mean(speedX[bins])
                else:
                    avgSpeeds[t, b] = np.NaN

            # Correct for starting speed - first third of the assay
            baseSpeed = stats.nanmean(avgSpeeds[t, 0:14])
            avgSpeeds[t, :] = avgSpeeds[t, :] / baseSpeed

            # Get Labels
            label = labels[c]

            if utils.is_dict_subset({'state': 'stable'}, label):
                trialTypes[t] = 0
            else:
                trialTypes[t] = 1

        # Pool All Average Speeds/TrialTypes Across Sessions
        avgSpeeds_allsess.append(avgSpeeds)
        trialTypes_allsess.append(trialTypes)

    avgSpeeds = np.concatenate(avgSpeeds_allsess)
    trialTypes = np.concatenate(trialTypes_allsess)
    return avgSpeeds, trialTypes
Example #32
    return


def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'MaxEnt classifier training complete, saved predict labels to pickle')
    return


def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'kNN classifier training complete, saved predict labels to pickle')
    return


if __name__ == "__main__":
    train_data = load_pickle('./data/transformed_data/transformed_train.p')
    test = load_pickle('./data/transformed_data/transformed_test.p')
    _, train_labels = load_train_data()
    mNB(train_data, train_labels, test)
    f1 = f1_score(true, predict, average="binary")
    precision_binary, recall_binary, fbeta_score_binary, _ = precision_recall_fscore_support(
        true, predict, average="binary"
    )
    accuracy = accuracy_score(true, predict)
    print("正确率(Accuracy):%.3f\nF值(Macro-F score):%.3f" % (accuracy, f1))
    print("精确度(Precision):%.3f\n召回率:%.3f\nF值: %.3f" % (precision_binary, recall_binary, fbeta_score_binary))
    log_performance(accuracy, f1, precision_binary, recall_binary, len(true))
    if figure == False:
        return
    # plot the results
    n_groups = 5
    values = (accuracy, f1, precision_binary, recall_binary, fbeta_score_binary)
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    rects1 = plt.bar(index + bar_width / 2, values, bar_width, alpha=0.6, color="b")
    plt.xlabel("Result")
    plt.ylabel("Scores")
    plt.title("Experiment analysis")
    plt.xticks(index + bar_width, ("Accuracy", "F", "Precision", "Recall", "F"))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    predict = load_pickle("./data/predict_labels/predict_labels.p")
    _, true_labels = load_test_data()
    analysis_result(predict, true_labels)
Example #34
import process_trajectories as proctraj

plt.close('all')

# Set Base Path
base_path = r'C:/Users/gonca_000/Documents/Insync/[email protected]'
#base_path = r'C:\kampff\Insync\[email protected]'
#base_path = r'D:\kampff\Insync\[email protected]'
#base_path = r'C:\kampff\Insync'

# Set Figure Directory
saveDirectory = r'C:\Users\gonca_000\Desktop\All Trajectories'

# Load and process the 'pickled' Trajectories (Week 3 only)
if 'experiment' not in locals():
    experiment = load_data.load_pickle(base_path + r'\protocols\shuttling\data\trajectories_week3.pickle')

# Set Animal
#animals = [1, 4, 5, 12, 13, 2, 3, 10, 11, 8, 9, 6, 7]
#names = ['Ca', 'Lb', 'Cb', 'Lc', 'Cc', 'Ld', 'Cd' ,'Le', 'Ce', 'Lf', 'Cf', 'Lg', 'Cg']

# Set Trial Validation Directory
validationpath = base_path + r'\protocols\shuttling\ARK\MC Lesion-Sham Analysis\Figures\Figure 3\Valid Trials'

############# Figure 3b - Example Profiles ######################
animals = [4, 5]
names = ['Lb', 'Cb']
sessions = [1, 2, 3]

profiles = [figutils.get_randomized_speed_profiles(experiment,a,sessions,validationpath) for a in animals]
[figutils.plot_randomized_speed_profiles(avgSpeeds,trialTypes) for avgSpeeds,trialTypes in profiles]
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Data constructed and saved, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
    return (data, W)
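A usage sketch (an illustrative addition): pick the embedding model and whether to use the amended vectors; data has the same 15-element layout as in build_keras_input_amended above.

data, W = keras_nn_input('GloVe', amending=False)
x_train_idx_data = data[0]
print('train samples: %d, embedding matrix shape: %s' % (len(x_train_idx_data), str(W.shape)))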
Example #36
def convert(source_file):
    s = load_pickle(source_file)
    dump_picle(s, str(source_file)[:-2] + '_v2.7.p', protocol=2)
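A usage sketch (an addition): re-pickle a file with protocol 2 so it can also be read from Python 2.7.

convert('./tmp/Weight.p')  # writes ./tmp/Weight_v2.7.p next to the original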
import os
import pandas
import itertools
import load_data
import numpy as np
import video_player
import subprocess
import process_trajectories
import plot_utilities as pltutils
import matplotlib.pyplot as plt

if 'data' not in locals():
    #    data = load_data.load_pickle(r'G:/Homework/trajectories.pickle')
    data = load_data.load_pickle(
        r'C:/Users/gonca_000/Documents/Insync/[email protected]/protocols/shuttling/data/trajectories_week1.pickle'
    )
    process_trajectories.rebase_video_path(data, 'D:')

width_pixel_to_cm = 50.0 / 1280.0
frames_per_second = 120.0

crop = [100, 1100]
traj = data[0][4].trajectories
slices = process_trajectories.clump_trajectories(traj, crop)
trajectory_interval = [len(traj[x, 0]) / frames_per_second for x in slices[1:]]
progression_speed = [
    np.diff(traj[x, 0]) * width_pixel_to_cm * frames_per_second
    for x in slices[1:]
]
average_speed = [np.mean(s) for s in progression_speed]
Example #38
def load_dataset(pathlist, sessionslice=slice(None)):
    return [load_data.load_pickle(path)[sessionslice] for path in pathlist]
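A usage sketch with hypothetical paths (an addition; each pickle is assumed to hold a list of sessions, trimmed by sessionslice).

paths = ['./data/animal01.pickle', './data/animal02.pickle']   # hypothetical files
datasets = load_dataset(paths, sessionslice=slice(0, 3))       # first three sessions of each animal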
Example #39
__author__ = 'NLP-PC'

from load_data import load_pickle
from file_name import get_file_path
from evaluate import evaluate

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'news_articles')

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles1.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'news_articles')

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles2.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'news_articles')

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles3.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'news_articles')

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles4.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'news_articles')
def analysis_preprocess():
    preprocessed = load_pickle('./data/acc/labeled_data.p')
    for id, i in enumerate(preprocessed):
        print('| %s | %s |' % (id, i))
__author__ = 'NLP-PC'
from load_data import load_pickle
import nltk
from load_data import load_extend_anew

words, _, _=load_extend_anew()

feature_names = load_pickle('./data/features/feature_names.p')
print(feature_names)
english_stemmer=nltk.stem.SnowballStemmer('english')
stemmed_dict = [english_stemmer.stem(w) for w in words]
print(len(stemmed_dict))
overlapping_words= (set(feature_names) & set(stemmed_dict))
print(len(overlapping_words))
print(english_stemmer.stem(''))
features = load_pickle('./data/transformed_data/transformed_train.p')
print(features[1,249])
print(type(features))

d='We are very nice goes I am nicely'
sent = list(d.split())
print(sent)
stemmed_sent = [english_stemmer.stem(w) for w in sent]
print(stemmed_sent)
Example #42
    def train(self, config):
        #####Train DCGAN####

        global_step = tf.Variable(0,name='global_step',trainable=False)
        global_step1 = tf.Variable(0,name='global_step1',trainable=False)

        g_optim = tf.train.AdamOptimizer(config.learning_rate,beta1=config.beta1) \
                          .minimize(self.gen_loss, global_step=global_step,var_list=self.g_vars)
	if self.dis_loss:	
	    d_optim = tf.train.AdamOptimizer(config.learning_rate,beta1=config.beta1) \
                          .minimize(self.d_loss, global_step=global_step1,var_list=self.d_vars)

	tf.initialize_all_variables().run()
	
        start_time = time.time()

        if self.load(self.checkpoint_dir):
            print(" [*] Load SUCCESS")
        else:
            print(" [!] Load failed...")

        # load training and validation dataset paths
	dataset = load_pickle()
	train_input = dataset['train_input']
	train_gt = dataset['train_gt']
	val_input = dataset['val_input']
	val_gt = dataset['val_gt']
	S = range(len(train_input))
	shuffle(S)
	SS = range(len(train_input[0]))
	shuffle(SS) 
        list_val = [11,16,21,22,33,36,38,53,59,92]


	if self.use_queue:
            # create thread
	    coord = tf.train.Coordinator()
            num_thread =1
            for i in range(num_thread):
 	        t = threading.Thread(target=self.load_and_enqueue,args=(coord,train_input,train_gt,S,SS,i,num_thread))
	 	t.start()

	if self.use_queue:
	    for epoch in xrange(config.epoch):
	        #shuffle = np.random.permutation(range(len(data)))
	        batch_idxs = min(len(train_input), config.train_size)/config.batch_size
		sum_L = 0.0
		sum_g =0.0
		sum_ang =0.0
		sum_low =0.0
		sum_high =0.0
		if epoch ==0:
		    train_log = open(os.path.join("logs",'train_%s.log' %config.dataset),'w')
		    val_log = open(os.path.join("logs",'val_%s.log' %config.dataset),'w')
		else:
	    	    train_log = open(os.path.join("logs",'train_%s.log' %config.dataset),'aw')
		    val_log = open(os.path.join("logs",'val_%s.log' %config.dataset),'w')

		for idx in xrange(0,batch_idxs):
        	     start_time = time.time()
		     if self.dis_loss:
		         _,d_loss_real,d_loss_fake =self.sess.run([d_optim,self.d_loss_real,self.d_loss_fake],feed_dict={self.keep_prob:self.dropout})
		     _,g_loss,ang_loss,L_loss,low_loss,high_loss =self.sess.run([g_optim,self.g_loss,self.ang_loss,self.L_loss,self.low_loss,self.high_loss],feed_dict={self.keep_prob:self.dropout})
		     print("Epoch: [%2d] [%4d/%4d] time: %4.4f g_loss: %.6f L_loss:%.4f ang_loss: %.6f low_loss: %.6f high_loss:%.6f" \
		     % (epoch, idx, batch_idxs,time.time() - start_time,g_loss,L_loss,ang_loss,low_loss,high_loss))
		     sum_L += L_loss 	
		     sum_g += g_loss
		     sum_ang += ang_loss
		     sum_low += low_loss
		     sum_high += high_loss

		train_log.write('epoch %06d mean_g %.6f  mean_L %.6f mean_ang %.6f mean_low %.6f mean_high %.6f\n' %(epoch,sum_g/(batch_idxs),sum_L/(batch_idxs),sum_ang/batch_idxs,sum_low/(batch_idxs),sum_high/batch_idxs))
		train_log.close()
	        self.save(config.checkpoint_dir,global_step)
		"""
		####### Validation #########
		for idx2 in xrange(0,len(list_val)):
		    for tilt in range(1,10):	
		        print("Epoch: [%2d] [%4d/%4d] " % (epoch, idx2, len(list_val)))
		        img = '/research2/IR_normal_small/save%03d/%d' % (list_val[idx2],tilt)
			light = random.randint(1,12)
			input_ = scipy.misc.imread(img+'/%d3.bmp' %light).astype(float)
			input_ = scipy.misc.imresize(input_,[600,800])
			input_ = input_/127.5 - 1.0
			input_ = np.reshape(input_,[1,600,800,1])
			gt_ = scipy.misc.imread(img+'/12_Normal.bmp').astype(float)
			gt_ = gt_/127.5 -1.0
 		        sample = self.sess.run([self.sample],feed_dict={self.ir_test: input_})
			L1_loss = tf.reduce_mean(tf.square(tf.sub(sample,gt_)))
			sum_L1 += L1_loss
			
		val_log.write('epoch %06d mean_L1 %.6f \n' %(epoch,sum_L1/(len(range(1,10)*len(list_val)))))
		val_log.close()
		"""
	else:
	    for epoch in xrange(config.epoch):
                 # load training and validation dataset paths
	         shuffle_ = np.random.permutation(range(len(data)))
	         batch_idxs = min(len(data), config.train_size)/config.batch_size
		    
	         for idx in xrange(0, batch_idxs):
        	     start_time = time.time()
		     batch_files = shuffle_[idx*config.batch_size:(idx+1)*config.batch_size]
    		     batches = [get_image(datalist[batch_file],labellist[batch_file],self.image_size,np.random.randint(64,224-64),\
					np.random.randint(64,224-64), is_crop=self.is_crop) for batch_file in batch_files]

		     batches = np.array(batches).astype(np.float32)
		     batch_images = np.reshape(batches[:,:,:,0],[config.batch_size,64,64,1])
		     batchlabel_images = np.reshape(batches[:,:,:,1:],[config.batch_size,64,64,3])
		     #mask_mean = batch_mask * self.mean_nir
		     #batch_images = batch_images- mask_mean
		     # Update Normal D network
		     _= self.sess.run([d_optim], feed_dict={self.ir_images: batch_images,self.normal_images:batchlabel_images })
		     self.writer.add_summary(summary_str, global_step.eval())

		     # Update NIR G network
		     _,g_loss,L1_loss = self.sess.run([g_optim,self.g_loss,self.L1_loss], feed_dict={ self.ir_images: batch_images,self.normal_images:batchlabel_images})
		     print("Epoch: [%2d] [%4d/%4d] time: %4.4f g_loss: %.6f L1_loss:%.4f" \
                     % (epoch, idx, batch_idxs, time.time() - start_time, g_loss, L1_loss))
	         self.save(config.checkpoint_dir,global_step) 
Example #43
from load_data import load_pickle


def build_devtest_submit(gold, predict):
    out = []
    with open(gold) as gf:
        for line in gf:
            out.append(line)
    cut_offs = [0.2, 0.4, 0.6, 0.8]

    with open('./resources/predict_cnn_lstm.txt', 'w') as o:
        for i, line in enumerate(out):
            score = predict[i]
            if score > cut_offs[3]:
                s = 2
            elif score > cut_offs[2]:
                s = 1
            elif score > cut_offs[1]:
                s = 0
            elif score > cut_offs[0]:
                s = -1
            else:
                s = -2
            print(line[:-1], i, predict[i], s)
            o.write('\t'.join(line.split('\t')[0:2]) + '\t' + str(s) + '\n')


(_, predict) = load_pickle("./tmp/submit_cnn_lstm.p")
gold = './resources/devtest_gold_FILE.tsv'
build_devtest_submit(gold, predict)
            for line in lexicon:
                if word == line:
                    count = count + 1
                    sum_valence = sum_valence + lexicon[line]
        return 5 if count == 0 else sum_valence / count

    for i, text in enumerate(corpus):
        V = VA_mean(text)
        valence_pred.append(V)
        valence_true.append(mark[i])
    print(valence_true[:200])
    print(valence_pred[:200])
    evaluate(valence_true, valence_pred, 'valence')


idfs = load_pickle('./data/vocab_idf.p')


def tfidf(t, d, D):
    d = d.split()
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
    # idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))
    return tf * idfs[t]


def tf(t, d):
    d = d.split()
    tf = float(d.count(t)) / float(len(d))
    return tf

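A worked example of the weighting functions above (an addition; tfidf also needs the idfs pickle, so only tf is evaluated to a concrete number).

d = 'good movie very good story'
print(tf('good', d))    # 2 occurrences / 5 tokens = 0.4
# tfidf('good', d, D) multiplies the same term frequency by idfs['good'] from ./data/vocab_idf.p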
Example #45
import csv
import time

from load_data import load_pickle

(test, submit_predict) = load_pickle("./tmp/submit_cnn_valid.p")
ids, topics, texts = test
print(len(submit_predict))
exit()
ratings = []
cut_offs = [0.2, 0.4, 0.6, 0.8]
# cut_offs = [0.125, 0.375, 0.625, 0.875]

for score in submit_predict:
    if score > cut_offs[3]:
        s = 2
    elif score > cut_offs[2]:
        s = 1
    elif score > cut_offs[1]:
        s = 0
    elif score > cut_offs[0]:
        s = -1
    else:
        s = -2
    ratings.append(s)

for t in texts:
    print(t)

timestr = time.strftime("%Y%m%d-%H%M%S")
path = "./tmp/submit" + str(timestr) + ".csv"
    print(list(predict)[:100])
    print(Y_test[:100])
    evaluate(list(predict), np.array(Y_test),
             'linear regression ' + 'Explained variance score: %.2f' % regr.score(X_test, Y_test))


def cv(data, target):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.2, random_state=10)
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


def simple_evaluate(model):
    print('Words covered by the word vectors (first 200):')
    print(list(model.vocab.keys())[:200])
    print('A look at what a word vector looks like:')
    print(model.docvecs[0])
    print(model.docvecs['L_SENT_4'])
    print('most_similar: ')
    print(model.most_similar('awesome'))
    print(model.most_similar('bad'))

if __name__ == "__main__":
    run_build_docvecs()  # only at the first time, you should run this
    X, Y = load_pickle('./data/acc/twitter_docvecs.p')
    Y = np.array(Y) + np.ones(len(Y), dtype=float) * 5
    cv(X, Y)
Example #47
__author__ = 'nobody'
from load_data import load_pickle
from file_name import get_file_path

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_result.p')

from evaluate import evaluate

print(Y_test)
print(predict)
evaluate(Y_test, predict, 'Result of CNN')
Example #48
__author__ = 'NLP-PC'
from load_data import load_pickle
from evaluate import evaluate
import random
from regression import linear_regression, linear_regression_multivariant
from sklearn import cross_validation

mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = load_pickle('./data/vader_out.p')

# size = 720
# slice_idx = random.sample(range(len(ratings)), size)  # randomly pick size elements from the list and return them as a slice
# mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = mean_ratings[slice_idx], tf_means[slice_idx], \
#                                                                           tfidf_means[
#                                                                               slice_idx], geos[slice_idx], tf_geos[
#                                                                               slice_idx], tfidf_geos[slice_idx], \
#                                                                           ratings[slice_idx]


evaluate(ratings, mean_ratings, 'mean_ratings')
evaluate(ratings, tf_means, 'tf_means')
evaluate(ratings, tfidf_means, 'tfidf_means')
evaluate(ratings, geos, 'geos')
evaluate(ratings, tf_geos, 'tf_geos')
evaluate(ratings, tfidf_geos, 'tfidf_geos')

################################################ Regression Methods ##########################################
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(mean_ratings, ratings, test_size=0.2,
#                                                                      random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(tf_means, ratings, test_size=0.2, random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
Example #49
__author__ = 'NLP-PC'
from load_data import load_pickle
from evaluate import evaluate
import random
from regression import linear_regression, linear_regression_multivariant
from sklearn import cross_validation

mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = load_pickle(
    './data/vader_out.p')

# size = 720
# slice_idx = random.sample(range(len(ratings)), size)  # randomly pick size elements from the list and return them as a slice
# mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = mean_ratings[slice_idx], tf_means[slice_idx], \
#                                                                           tfidf_means[
#                                                                               slice_idx], geos[slice_idx], tf_geos[
#                                                                               slice_idx], tfidf_geos[slice_idx], \
#                                                                           ratings[slice_idx]

evaluate(ratings, mean_ratings, 'mean_ratings')
evaluate(ratings, tf_means, 'tf_means')
evaluate(ratings, tfidf_means, 'tfidf_means')
evaluate(ratings, geos, 'geos')
evaluate(ratings, tf_geos, 'tf_geos')
evaluate(ratings, tfidf_geos, 'tfidf_geos')

################################################ Regression Methods ##########################################
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(mean_ratings, ratings, test_size=0.2,
#                                                                      random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(tf_means, ratings, test_size=0.2, random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
Example #50
            for line in lexicon:
                if word == line:
                    count = count + 1
                    sum_valence = sum_valence + lexicon[line]
        return 5 if count == 0 else sum_valence / count

    for i, text in enumerate(corpus):
        V = VA_mean(text)
        valence_mean.append(V)
        valence_true.append(mark[i])
    return valence_mean, valence_true


from load_data import load_pickle

idfs = load_pickle('./data/vocab_idf.p')


def tfidf(t, d):
    d = d.split()
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
    # idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))
    return tf * idfs[t]


def tf(t, d):
    d = d.split()
    tf = float(d.count(t)) / float(len(d))
    return tf

Example #51
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
# print(corpus[:2])
# vocab = get_vocab(corpus)
# dump_picle(vocab, get_file_path('CVAT_Vocab'))
# print('OK')
vocab = load_pickle(get_file_path('CVAT_Vocab'))
# for i in vocab:
#     print(i)
# print(len(vocab))

# W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')

word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
                                   Y_test,
                                   cost_fun='Bayesian_Regression')
    linear_regression_multivariant(X_train,
                                   X_test,
                                   Y_train,
                                   Y_test,
                                   cost_fun='SVR')
    linear_regression_multivariant(X_train,
                                   X_test,
                                   Y_train,
                                   Y_test,
                                   cost_fun='KNN_Reg')


def simple_evaluate(model):
    print('Words covered by the word vectors (first 200):')
    print(list(model.vocab.keys())[:200])
    print('A look at what a word vector looks like:')
    print(model.docvecs[0])
    print(model.docvecs['L_SENT_4'])
    print('most_similar: ')
    print(model.most_similar('awesome'))
    print(model.most_similar('bad'))


if __name__ == "__main__":
    run_build_docvecs()  # only at the first time, you should run this
    X, Y = load_pickle('./data/acc/twitter_docvecs.p')
    Y = np.array(Y) + np.ones(len(Y), dtype=float) * 5
    cv(X, Y)
from load_data import load_pickle


def build_devtest_submit(gold, predict):
    out = []
    with open(gold) as gf:
        for line in gf:
            out.append(line)
    cut_offs = [0.2, 0.4, 0.6, 0.8]

    with open('./resources/predict_cnn_lstm.txt', 'w') as o:
        for i, line in enumerate(out):
            score = predict[i]
            if score > cut_offs[3]:
                s = 2
            elif score > cut_offs[2]:
                s = 1
            elif score > cut_offs[1]:
                s = 0
            elif score > cut_offs[0]:
                s = -1
            else:
                s = -2
            print(line[:-1], i, predict[i], s)
            o.write('\t'.join(line.split('\t')[0:2])+'\t'+str(s)+'\n')


(_, predict) = load_pickle("./tmp/submit_cnn_lstm.p")
gold = './resources/devtest_gold_FILE.tsv'
build_devtest_submit(gold, predict)