Example #1
def train_net():

    output, X, YY, keep_prob = model.model()

    def _onehot(labels):  # one-hot encode the labels
        return tf.one_hot(labels, depth=26, on_value=1.0, axis=2)
    Y = _onehot(YY)
    print(Y)
    # loss definition
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.reshape(Y, [-1, 26 * 4]), logits=output))

    # optimizer choice
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)
    predict = tf.reshape(output, [-1, 4, 26])
    max_idx_p = tf.argmax(predict, 2)
    max_idx_l = tf.argmax(tf.reshape(Y, [-1, 4, 26]), 2)
    correct_pred = tf.equal(max_idx_p, max_idx_l)

    accuracy = tf.reduce_mean(tf.reduce_min(tf.cast(correct_pred, tf.float32), axis=1))

    # read the data
    images, labels = preprocess.read_data(['deal/0.txt', 'deal/1.txt', 'deal/2.txt'])
    total, width = labels.shape

    image_test, label_test = preprocess.read_data(['deal/test.txt'])

    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()
    # training loop
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # write training summaries to TensorBoard
        filewriter = tf.summary.FileWriter("logs/", graph=sess.graph)

        step = 0
        if os.listdir('./check_point'):
            ckpt = tf.train.latest_checkpoint('./check_point')
            # print(ckpt)
            # ckpt = './check_point/weight-3980'
            saver.restore(sess, ckpt)
            print('restore from the checkpoint: {0}'.format(ckpt))
            # images, labels = preprocess.read_data(['deal/0.txt', 'deal/1.txt', 'deal/2.txt'])
        while True:
            for i in range(int(total / batch_size)):
                start_index = (i * batch_size) % total
                image_batch = images[start_index: start_index + batch_size]
                label_batch = labels[start_index: start_index + batch_size]
                summary, _, loss_ = sess.run([merged, optimizer, loss], feed_dict={X: image_batch, YY: label_batch, keep_prob: 0.75})
                print(step, 'loss: %f' % loss_)
                filewriter.add_summary(summary, step)
                step += 1

                if step % 10 == 0:
                    acc = sess.run(accuracy, feed_dict={X: image_test, YY: label_test, keep_prob: 1.0})
                    print('Step %d: accuracy on the test set is %.2f' % (step, acc))
                    if acc > 0.4:
                        saver.save(sess, './check_point/weight', global_step=step)
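To make the accuracy definition above concrete: taking `tf.reduce_min` over the per-character correctness means a captcha only counts as correct when all four predicted characters match. A small NumPy sketch with made-up indices (not from the original project) shows the same computation:
import numpy as np

pred_chars = np.array([[3, 7, 12, 25], [0, 1, 2, 3]])     # predicted character indices, shape (batch, 4)
true_chars = np.array([[3, 7, 12, 24], [0, 1, 2, 3]])     # ground-truth character indices
per_char = (pred_chars == true_chars).astype(np.float32)  # per-position correctness
per_captcha = per_char.min(axis=1)                        # 1.0 only if every position is right
print(per_captcha.mean())                                 # 0.5 for this toy batch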
Example #2
def dt():
    start_time = time.time()
    data_frame, data_discrete_info, data_continuous_info = preprocess.read_data(train_filename, discrete_keys,
                                                                                continuous_keys)
    test_frame, _, __ = preprocess.read_data(test_filename, discrete_keys, continuous_keys)
    # attributes = discrete_keys + continuous_keys
    tree = decision_tree.DecisionTree(data_frame, discrete_keys + continuous_keys, data_discrete_info,
                                      data_continuous_info, 'y')
    tree.build()
    # tree.show_tree()
    error_rate = tree.inference(test_frame)
    end_time = time.time()
    print("Time cost:", end_time - start_time)
    return error_rate
Example #3
def train_gmm():
    feature, label = read_data(config.DATA_DIR)
    feature, label = feature.reshape(feature.shape[0],
                                     -1).numpy(), label.numpy()
    train_feature, test_feature, train_label, test_label = train_test_split(
        feature, label)
    model_0 = GaussianMixture(n_components=3,
                              max_iter=100,
                              weights_init=[1 / 3, 1 / 3, 1 / 3],
                              random_state=42)
    model_1 = GaussianMixture(n_components=3,
                              max_iter=100,
                              weights_init=[1 / 3, 1 / 3, 1 / 3],
                              random_state=42)
    # model.means_init = numpy.array([train_feture[train_label == i].mean(axis=0)
    #                                 for i in range(2)])

    model_0.fit(train_feature[train_label == 0], train_label[train_label == 0])
    model_1.fit(train_feature[train_label == 1], train_label[train_label == 1])
    # pred = model.predict(test_feature)
    # for feat in test_feature:
    y_pred = []
    score_0 = model_0.score_samples(test_feature)
    score_1 = model_1.score_samples(test_feature)
    for i in range(len(score_1)):
        if score_0[i] > score_1[i]:
            y_pred.append(0)
        else:
            y_pred.append(1)

    # print(model_0.score_samples(test_feature), model_1.score_samples(test_feature))
    print(accuracy_score(test_label, y_pred))

    # recall and precision
    matrix = classification_report(test_label, y_pred)
    print("Classification report: \n", matrix)

    # Plot non-normalized confusion matrix
    np.set_printoptions(precision=2)
    con_matrix = confusion_matrix(test_label, y_pred)
    class_names = ["Non_cough", "Cough"]
    plt.figure()
    plot_confusion_matrix(con_matrix,
                          classes=class_names,
                          title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(con_matrix,
                          classes=class_names,
                          normalize=True,
                          title='Normalized confusion matrix')

    plt.show()

    pickle.dump(model_0, open("resource/non_cough.pkl", 'wb'))
    pickle.dump(model_1, open("resource/cough.pkl", 'wb'))

    print(model_1.weights_)
    print(model_0.weights_)
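The decision rule above trains one GaussianMixture per class and assigns each sample to the class whose model gives it the higher log-likelihood. A minimal vectorized sketch of the same rule (assuming `model_0` and `model_1` are the fitted per-class GMMs from above):
import numpy as np

def gmm_classify(model_0, model_1, features):
    score_0 = model_0.score_samples(features)  # per-sample log-likelihood under the "non-cough" GMM
    score_1 = model_1.score_samples(features)  # per-sample log-likelihood under the "cough" GMM
    return np.where(score_0 > score_1, 0, 1)   # pick the more likely class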
Example #4
def get_train_data():
    tmp1, tmp2 = pp.read_data()
    S,A = pp.preprocess_data(tmp1, tmp2)
    S_train, A_train, _, _ = split_data(S,A)
    print("train size:  ", len(S_train))
    #save_trainsplit_data(S_train, A_train)
    return S_train, A_train
def predict(model, message):
    """Return prediction: float

    Returns closing stock price for given row.
    """
    train_df = read_data(TRAIN_DATA)
    dataset_test = pd.DataFrame(message, index=[0])
    dataset_test.columns = train_df.columns
    scaler = MinMaxScaler(feature_range=(0, 1))

    df = train_df.tail(300)

    # print('-----------------', df.tail(1))
    df_close = df['close']
    # print(df_close.head())
    # print(df_close.shape)
    df_close = scaler.fit_transform(np.array(df_close).reshape(-1, 1))
    X_test = np.array(df_close[len(df_close) - 300:, 0])
    temp_list = np.array(X_test).reshape(1, 300)
    # print(temp_list.shape)
    # # print(X_test.shape)
    # print('\n')
    # print('----------')
    X_test = X_test.reshape(temp_list.shape[0], temp_list.shape[1], 1)
    predicted_stock_price = model.predict(X_test)
    predicted_stock_price = scaler.inverse_transform(predicted_stock_price)
    train_df = pd.concat([train_df, dataset_test], ignore_index=True)  # DataFrame.append is removed in pandas >= 2.0
    train_df.to_csv(TRAIN_DATA, index=False)
    return predicted_stock_price
Example #6
def generator(wav_name_list, batch_size, sample_rate, peak_norm, voc_mode,
              bits, mu_law, wave_path, voc_pad, hop_length, voc_seq_len,
              preemphasis, n_fft, n_mels, win_length, max_db, ref_db, top_db):
    # A generator can only be iterated once, so while True keeps producing data for multiple epochs
    while True:
        # shuffle all the data once per epoch
        #order = np.random.choice(len(wav_name_list), len(wav_name_list), replace=False)
        #audio_data_path_list = [wav_name_list[i] for i in order]
        audio_data_path_list = wav_name_list
        batchs = len(wav_name_list) // batch_size
        for idx in range(batchs):
            # take the audio file names for this batch
            wav_name_list2 = audio_data_path_list[idx * batch_size:(idx + 1) *
                                                  batch_size]

            # load the audio data
            input_mel, input_sig = read_data(wave_path, sample_rate, peak_norm,
                                             voc_mode, bits, mu_law,
                                             wav_name_list2, preemphasis,
                                             n_fft, n_mels, hop_length,
                                             win_length, max_db, ref_db,
                                             top_db)

            dataset = collate_vocoder(input_mel, input_sig, voc_seq_len,
                                      hop_length, voc_pad, voc_mode, bits)
            # input_mel = tf.convert_to_tensor(input_mel[0])
            # input_sig = tf.convert_to_tensor(input_sig[0])
            yield dataset  # yield one batch per step
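The comment at the top of `generator` describes the usual pattern for feeding Keras/TensorFlow training loops: a plain generator is exhausted after one pass, so wrapping the epoch loop in `while True` makes it yield batches indefinitely. A self-contained toy version of that pattern (names are illustrative, not from the project):
def infinite_batches(items, batch_size):
    while True:                       # restart over the data every epoch
        for i in range(len(items) // batch_size):
            yield items[i * batch_size:(i + 1) * batch_size]

gen = infinite_batches(list(range(10)), batch_size=3)
print(next(gen), next(gen), next(gen), next(gen))  # [0, 1, 2] [3, 4, 5] [6, 7, 8] [0, 1, 2]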
Example #7
def main():

    print('Restoring map...')
    enc_map = cPickle.load(open(encode_map, 'rb'))
    dec_map = cPickle.load(open(decode_map, 'rb'))
    vocab_size = len(dec_map)

    print('Build Dataset...')
    lines = read_data(predict_file)
    question_list = parse_input_data_list(lines, enc_map, 50, False)


    print('Build Model...')
    model = MemNet(vocab_size = vocab_size,
                    embed_size = 512,
                    n_hop = 6,
                    memory_size = 20,
                    sentence_size = 50,
                    option_size = 10)

    print('Build Solver...')
    solver = Solver(model, enc_map, dec_map,
                    eval_batch_size = 1,
                    test_record_path = './record/test/',
                    test_examples = 10000,
                    restore_path = './checkpoint/',
                    print_step = 5)

    answer = solver.predict(question_list)
    idx = [x for x in range(1, len(question_list)+1)]
    import pandas as pd
    df = pd.DataFrame(data={'answer':answer})
    df.index += 1
    df.to_csv('predict.csv', index=True, index_label='id')
Example #8
def train_svm():
    feature, label = read_data(config.DATA_DIR)
    train_feature, test_feature, train_label, test_label = train_test_split(
        feature, label)
    train_feature = train_feature.reshape(train_feature.shape[0], -1)
    test_feature = test_feature.reshape(test_feature.shape[0], -1)
    svm = SVC()
    svm.fit(train_feature.numpy(), train_label.numpy())
    y_pred = svm.predict(test_feature.numpy())
    #pickle.dump(svm, open("resource/gmm.pkl", 'wb'))
    print(accuracy_score(test_label.numpy(), y_pred))
    #print(confusion_matrix(test_label.numpy(), y_pred))
    print(y_pred)
    np.set_printoptions(precision=2)
    matrix = classification_report(test_label.numpy(), y_pred)
    print("Classification report: \n", matrix)

    # Plot non-normalized confusion matrix
    con_matrix = confusion_matrix(test_label.numpy(), y_pred)
    class_names = ["Non_cough", "Cough"]
    plt.figure()
    plot_confusion_matrix(con_matrix,
                          classes=class_names,
                          title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(con_matrix,
                          classes=class_names,
                          normalize=True,
                          title='Normalized confusion matrix')

    plt.show()
Example #9
    def test_read_data(self):
        df = read_data(self.data_dir, self.bodies_file, self.stances_file)

        self.assert_valid_df(df)
        self.assertIsInstance(df.iloc[0]['text_a'], str)
        self.assertIsInstance(df.iloc[0]['text_b'], str)
        self.assertIn(df.iloc[0]['labels'], CLASSES.keys())
Example #10
def get_test_data():
    tmp1, tmp2 = pp.read_data()
    S,A = pp.preprocess_data(tmp1, tmp2)
    _,_,S_test, A_test = split_data(S,A)
    print("test size:  ", len(S_test))
    #save_testsplit_data(S_test, A_test)
    return S_test, A_test
Example #11
    def test_split_data(self):
        df = read_data(self.data_dir, self.bodies_file, self.stances_file)
        train_data, dev_data, test_data = split_data(df)

        for data in train_data, dev_data, test_data:
            self.assert_valid_df(data)

        self.assertLess(len(train_data), 0.9 * len(df))
        self.assertLess(len(dev_data), 0.1 * len(df))
        self.assertAlmostEqual(len(test_data), 0.1 * len(df), delta=100)
Example #12
File: rnet.py Project: wykxyz/rnet
def train(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    print('Reading data')
    dp = preprocess.read_data('train', opt)
    sess = tf.Session(config=config)
    it, enqueue_op = dp.provide(sess)

    rnet_model = model.RNet(opt)
    loss, pt, accu = rnet_model.build_model(it)
    avg_loss = tf.reduce_mean(loss)
    train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)

    # saving model
    saver = tf.train.Saver()

    startTime = time.time()
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # start feeding threads
        coord = tf.train.Coordinator()
        threads = []

        for i in range(opt['num_threads']):
            t = Thread(target=feeder, args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)
        # start training
        for i in range(args.epochs):
            print('Training...{}th epoch'.format(i))
            training_time = int(dp.num_sample/dp.batch_size)
            for j in tqdm(range(training_time)):
                _, avg_loss_val, pt_val = sess.run([train_op, avg_loss, pt])
                if j % 100 == 0:
                    print('iter:{} - average loss:{}'.format(j, avg_loss_val))
            print('saving rnet_model{}.ckpt'.format(i))
            save_path = saver.save(sess, os.path.join(args.save_dir, 'rnet_model{}.ckpt'.format(i)))
        
        cancel_op = dp.q.close(cancel_pending_enqueues=True)
        sess.run(cancel_op)
        print('stopping feeders')
        coord.request_stop()
        coord.join(threads, ignore_live_threads=True)
    
    save_path = saver.save(sess, os.path.join(args.save_dir, 'rnet_model_final.ckpt'))
    
    sess.close()
    print('Training finished, took {} seconds'.format(time.time() - startTime))
Example #13
File: rnet.py Project: wykxyz/rnet
def evaluate(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    saved_model = args.model_path
    
    EM = 0.0
    F1 = 0.0
    with sess.as_default():
        print('Reading data')
        dp = preprocess.read_data('dev', opt)
        it, enqueue_op = dp.provide(sess)
        rnet_model = model.RNet(opt)
        loss, pt, accu = rnet_model.build_model(it)
        dequeued_p, asi, aei = it['p'], it['asi'], it['aei']
        
         # restore model
        print('restoring model...')
        saver = tf.train.Saver()
        saver.restore(sess, saved_model)

        # start feeding threads
        coord = tf.train.Coordinator()

        threads = []
        for i in range(opt['num_threads']):
            t = Thread(target=feeder, args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)
        # start prediction
        print('Prediction starts')
        num_batch = int(dp.num_sample/dp.batch_size)
        for j in tqdm(range(num_batch)):
            pt_val, p_batch, asi_batch, aei_batch = sess.run([pt, dequeued_p, asi, aei])
            f1, em = 0.0, 0.0
            for k in range(len(p_batch)):
                paragraph = p_batch[k][0].decode('utf8').split(' ')
                true_start, true_end = asi_batch[k][0], aei_batch[k][0]
                pred_start, pred_end = pt_val[k][0], pt_val[k][1]
                pred_tokens = paragraph[pred_start:(pred_end+1)]
                true_tokens = paragraph[true_start:(true_end+1)]
                f1 += f1_score(' '.join(pred_tokens), ' '.join(true_tokens))
                em += exact_match_score(' '.join(pred_tokens), ' '.join(true_tokens))
            print('{}th batch | f1: {} | em: {}'.format(j, f1/len(p_batch), em/len(p_batch)))
            F1 += f1
            EM += em
        print('Evaluation complete, F1 score: {}, EM score: {}'.format(F1/dp.num_sample, EM/dp.num_sample))
Example #14
def mlp_data():
    # Preprocess the data into a form the MLP can use

    # shuffle the data
    preprocess.cross_validation("train_set.csv", "train.csv", "test.csv", 0.1)
    # read the data
    train_data, discrete_values, continuous_values = preprocess.read_data("train_set.csv", discrete_keys, continuous_keys)
    # convert discrete values into numeric indices
    for key in discrete_values.keys():
        idx = 0
        for val in discrete_values[key]:
            train_data = train_data.replace(to_replace=val, value=idx)
            idx += 1

    train_features = train_data[list(discrete_keys + continuous_keys)]
    train_ground_truth = train_data['y']

    return train_features, train_ground_truth
Example #15
def bayes_data(test_percentage=0.1):
    # Convert the data into a form naive Bayes can handle
    data, discrete_infos, continuous_infos = preprocess.read_data('train_set.csv', discrete_keys, continuous_keys)
    # shuffle the data
    data_len = len(data)
    test_data_count = test_percentage * data_len
    data = shuffle(data)
    ground_truth = data['y']
    features = data[list(discrete_keys + continuous_keys)]
    # convert the data to numeric values and min-max normalize so all features are non-negative
    for discrete_key in discrete_keys:
        idx = 0
        for val in discrete_infos[discrete_key]:
            features = features.replace(to_replace=val, value=idx)
            idx += 1
    features = (features - features.min()) / (features.max() - features.min())


    return features, ground_truth
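The last line is column-wise min-max scaling, which maps every feature into [0, 1] and therefore keeps all values non-negative as the comment requires. A tiny pandas example of the same idiom (toy column names):
import pandas as pd

df = pd.DataFrame({"age": [20, 30, 40], "balance": [0, 50, 100]})
scaled = (df - df.min()) / (df.max() - df.min())  # each column rescaled to [0, 1]
print(scaled)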
Example #16
def train_lstm():
    device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    feature, label = read_data(config.DATA_DIR)
    train_feature, test_feature, train_label, test_label = train_test_split(
        feature, label)
    train_data = MyDataset(train_feature, train_label)
    test_data = MyDataset(test_feature, test_label)
    train_data = DataLoader(train_data, batch_size=16, shuffle=True)
    test_data = DataLoader(test_data, batch_size=16, shuffle=False)

    model = LSTMClassifier().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(params=model.parameters(), lr=0.001)

    for e in range(20):
        train_loss, train_acc = train(train_data, model, criterion, optimizer,
                                      device)
        test_loss, test_acc = test(test_data, model, criterion, device)
        print(train_loss, train_acc)
        print(test_loss, test_acc)
        print("----------------------------------------")
        torch.save(model,
                   os.path.join(config.MODEL_DIR, "model_{}.pt".format(e + 1)))
def accuracy(true_labels, predictions):
    true_pred = 0

    for i in range(len(predictions)):
        if np.argmax(predictions[i]) == np.argmax(
                true_labels[i]):  # predicted index matches the ground-truth index
            true_pred += 1

    return true_pred / len(predictions)


if __name__ == "__main__":

    #PROCESS THE DATA
    words, labels = read_data(path)
    sentences = create_samples(words, labels)
    train_x, train_y, test_x, test_y = split_data(sentences)

    # creating one-hot vector notation of labels. (Labels are given numeric)
    # [0 1] is PERSON
    # [1 0] is not PERSON
    new_train_y = np.zeros(shape=(len(train_y), output_size))
    new_test_y = np.zeros(shape=(len(test_y), output_size))

    for i in range(len(train_y)):
        new_train_y[i][int(train_y[i])] = 1

    for i in range(len(test_y)):
        new_test_y[i][int(test_y[i])] = 1
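The two loops above build one-hot rows by writing a 1 at the label index. An equivalent NumPy idiom, shown here only as an illustration (assuming integer labels and `output_size` classes), is to index rows of an identity matrix:
import numpy as np

labels = np.array([0, 1, 1, 0])
output_size = 2
one_hot = np.eye(output_size)[labels]  # row i has a 1 at column labels[i]
print(one_hot)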
Example #18
"""
Created on Wed Jun 24 22:46:47 2020

@author: ISIL
"""
from preprocess import read_data, read_TFIDF, create_vectors, get_vectors, split_data, convert_list_to_nd_array
from TFIDF import compute_TFIDF
from logistic_regression import logistic_reg
from SVM import support_vector_machine

path = "./data"

if __name__ == "__main__":

    #PROCESS THE DATA
    sentences, labels = read_data(path)
    TFIDF = compute_TFIDF(sentences, path)  #calculate TFIDF values
    word_list, TFIDF = read_TFIDF(
        path)  #read unique words list and TFIDF values
    create_vectors(word_list, TFIDF, path)  #vectorize data

    NROWS = len(TFIDF)
    NCOLS = len(word_list)

    #vectorized corpus (premise+hypothesis)
    data = get_vectors(path, NROWS, NCOLS)  #get vectorized data

    y = convert_list_to_nd_array("y", labels)

    train_x, train_y, test_x, test_y = split_data(
        data, y, path)  # split 80% for training, 20% for test
Example #19
                                    dec_x_lens)
        elif model.mode == "predict":
            _, loss = model.predict_step(enc_x, dec_x, dec_y, enc_x_lens,
                                         dec_x_lens)
        losses.append(loss)
        if (batch + 1) % 100 == 0:
            print("[{}] batch={:04d}, loss={:.4f}".format(
                datetime.datetime.now(), batch + 1, loss))
    avg_loss = np.mean(losses)
    return avg_loss


if __name__ == "__main__":
    time_start = time.time()
    vocabulary, vocabulary_reverse = load_vocab(args.data_path)
    train_examples, _ = read_data(args.train_file, args.max_utterance_len,
                                  args.max_example_len + 1)
    eval_examples, _ = read_data(args.eval_file, args.max_utterance_len,
                                 args.max_example_len + 1)
    if not os.path.exists(args.root_path):
        os.makedirs(args.root_path)

    tf.reset_default_graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None):
            train_model = SEQ2SEQ(session, options, "train")
        with tf.variable_scope("Model", reuse=True):
            eval_model = SEQ2SEQ(session, options, "predict")
c_range = [round(0.1 * a, 1)
           for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # 'poly' takes too long, so it is left out for now
############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################  take line-trans as feature   #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
print('take LINE-TRANS as feature')
wiki_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_line_trans/'
lyric_src = '/home/liruihan/Desktop/data/uni_entropy_trans/'
wa_src = '/home/liruihan/Desktop/data/wa_entropy/wa_line_trans/'
cm_src = '/home/liruihan/Desktop/data/cm_entropy/cm_line_trans/'
aozora_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_line_trans/'

wiki_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_src)]
lyric_line_trans = [
    e.split('\n')[:-1] for e in pre.read_data(lyric_src, 'num_') if e != '0\n'
]
#wa_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_src)]
#cm_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_src)]
aozora_line_trans = [e.split('\n')[:-1] for e in pre.read_data(aozora_src)]

wiki_line_percentile_entropy = []
for trans in wiki_line_trans:
    wiki_line_percentile_entropy.append(percentile_entropy(trans))
'''
wa_line_percentile_entropy = []
for trans in wa_line_trans:
    wa_line_percentile_entropy.append(percentile_entropy(trans))
'''
Example #21
File: lsa.py Project: lmy86263/pLSA

def lsa(X, number_of_topics):
    reduced_u, reduced_sigma, reduced_v = reduce_dimension(X, number_of_topics)
    word_topic_matrix = np.dot(reduced_u, reduced_sigma)
    topic_doc_matrix = np.dot(reduced_sigma, reduced_v)

    app_X = np.dot(np.dot(reduced_u, reduced_sigma), reduced_v)
    return word_topic_matrix, topic_doc_matrix, app_X


if __name__ == '__main__':
    files = glob.glob('./text/*.txt')
    documents = []
    for f in files:
        documents.append(read_data(f))

    documents, words = pre_process(documents)
    X = word_doc_matrix(words, documents)

    word_topic_matrix, topic_doc_matrix, app_X = lsa(X, 5)
    print(word_topic_matrix)
    print(topic_doc_matrix)
    print(app_X)
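`reduce_dimension` is not shown in this snippet; based on how `lsa` uses its return values, it presumably performs a truncated SVD. A minimal sketch of such a helper with NumPy (the exact implementation in the project may differ):
import numpy as np

def reduce_dimension(X, number_of_topics):
    u, sigma, vt = np.linalg.svd(X, full_matrices=False)  # full SVD of the word-document matrix
    k = number_of_topics
    return u[:, :k], np.diag(sigma[:k]), vt[:k, :]        # keep only the top-k singular triplets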






Example #22
        model["result"] = np.mean(correct)
        print(f"Accuracy {n+1}/{len(models)}:\t{model['result']}")

    return models

def save_accuracies(models):
    for model in models:
        results_savename = f"{config.gs_results_name}{model['ID']:04d}"
        with open(config.gs_directory + results_savename, "w+") as outfile:
            outfile.write(str(model["result"]))

if __name__ == "__main__":

    t0 = time()

    data = preprocess.read_data()
    data = preprocess.one_hot(data)
    data = preprocess.reshape_4D(data)

    params = {"kernel_size"     :   config.gs_kernel_size,
              "activation_hid"  :   config.gs_activation_hid,
              "activation_out"  :   config.gs_activation_out,
              "layers"          :   config.gs_layers,
              "layers_out"      :   [data["layers_out"]],
              "learning_rate"   :   config.gs_learning_rate,
              "epochs"          :   config.gs_epochs,
              "batch_size"      :   config.gs_batch_size}

    msg = "Requires cmdline arg 'load' or 'save'"
    if len(sys.argv) == 2:
        if sys.argv[1].lower() == "load":
Example #23
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # exactly one of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True

    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
           and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # exactly one of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (
        SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError(
            "You should choose one of bert, joint, granu and mgn in options")

    dct = {
        'NUM_TASK': NUM_TASK,
        'MASKING': MASKING,
        'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION,
        'HIER': HIER
    }
    model = load_model(model_type, **dct)

    if not id_:
        ids = get_existent_ids()
        id_ = random_module.randint(0, N)
        while id_ in ids:
            id_ = random_module.randint(0, N)
        with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'),
                  'w',
                  encoding='utf-8') as f:
            f.write(full_text)

    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)

    tmp_file = 'tmp.txt'
    eval(model,
         my_predict_iter,
         tmp_file,
         criterion,
         binary_criterion,
         NUM_TASK=NUM_TASK)
    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    t_texts = clean_text(texts, ids)
    flat_texts = [sentence for article in t_texts for sentence in article]
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    prop_sents = prop_sents[id_]
    prop_sents = ['1' if elem else '' for elem in prop_sents]

    results = remove_duplicates(fi)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))

    lst = [set() for _ in range(len(full_text))]
    source_lst = [set() for _ in range(len(full_text))]
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(
                inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])

    extracts_s_e = []
    extracts = []
    categories = []
    for elem in fi:
        if elem[0] != str(id_):
            continue
        _, category, start, end = elem
        extracts_s_e.append((start, end))
        extracts.append(text[start:end])
        categories.append(category)

    extracts = [
        ' '.join(normalize(extract.strip())) for extract in extracts if extract
    ]
    print(f'extracts: {extracts}')

    # CHECK
    # extracts = [word for sent in extracts for word in sent.split()]

    test_x, test_maxlen = get_data(extracts,
                                   vocab_size=args.vocab_size,
                                   maxlen=args.maxlen)
    test_x = sequence.pad_sequences(test_x,
                                    maxlen=max(train_maxlen, test_maxlen))

    test_length = test_x.shape[0]
    splits = []
    for i in range(1, test_length // args.batch_size):
        splits.append(args.batch_size * i)
    if test_length % args.batch_size:
        splits += [(test_length // args.batch_size) * args.batch_size]
    test_x = np.split(test_x, splits)

    with graph.as_default():
        aspect_model = keras_load_model(os.path.join('flask_app', 'output',
                                                     'reviews', 'model_param'),
                                        custom_objects={
                                            "Attention": Attention,
                                            "Average": Average,
                                            "WeightedSum": WeightedSum,
                                            "MaxMargin": MaxMargin,
                                            "WeightedAspectEmb":
                                            WeightedAspectEmb,
                                            "max_margin_loss":
                                            U.max_margin_loss
                                        },
                                        compile=True)

        test_fn = K.function([
            aspect_model.get_layer('sentence_input').input,
            K.learning_phase()
        ], [
            aspect_model.get_layer('att_weights').output,
            aspect_model.get_layer('p_t').output
        ])
        aspect_probs = []

        for batch in tqdm(test_x):
            _, cur_aspect_probs = test_fn([batch, 0])
            aspect_probs.append(cur_aspect_probs)

        aspect_probs = np.concatenate(aspect_probs)

        label_ids = np.argsort(aspect_probs, axis=1)[:, -5:]
        for i, labels in enumerate(label_ids):
            print(
                f'{extracts[i]}: {[aspects[label] for label in labels][::-1]}')

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    commands = {
        extract: ([aspects[label] for label in label_ids[i]][::-1], [])
        for i, extract in enumerate(extracts)
    }
    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)

    for f in glob.glob(f'{DIRECTORY_PREDICT}/*'):
        os.remove(f)

    return jsonify(
        result={
            'id': id_,
            'list': correct_lst,
            'text': text,
            'prop_sents': prop_sents,
            'commands': commands
        })
Example #24
def create_target_mapping(json_file_names, word_file_name, labels_file_name, word2id, vocabulary):
    """ Create the target sequence mappings. In order to map the target sequence
        to the original sequence, the tree ids from the json files are needed to
        know which words were kept and which were removed.
        
    Args:
        json_file_names: A list of json files that contain the data.
        word_file_name: The name of the text file containing the words.
        labels_file_name: The text file with the correct sentence compressions.
        word2id: The mapping from a word to an integer.
        vocabulary: The word vocabulary.
        
    Returns:
        target_seq_id: A list of sentences, where each of the words
            is mapped to an integer. It has the same dimesionality 
            of word_seq_id. A zero indicates that the word has been
            removed in the compression.
    """
    target_seq_id = []
    target_seq = read_file(labels_file_name, to_lower=True, replace_int=True)
    word_seq = read_file(word_file_name, to_lower=True, replace_int=True)
    sent_index = 0
    
    # Iterate files
    for file_name in json_file_names:
        json_object = preprocess.read_data(file_name)
        
        # Iterate sentences in a file
        for sentence in json_object['sentences']:
            word_dict = preprocess.create_word_dict(sentence)
            # Skip ('ROOT ') entry at [0]
            sent_word_ids = list(word_dict.keys())[1:]
            compression_word_ids = get_compression_word_ids(
                sentence['compression_untransformed'])
            # Vocabulary only has preprocessed words not the original words
            preprocessed_sent = word_seq[sent_index]
            target_sent_id = []
            # Append <bos> id
            target_sent_id.append(word2id[preprocessed_sent[0]])
            word_index = 1

            # Iterate words in a sentence
            # Preprocessed sentence may be shorter than original.
            # The preprocessed sentence contains <bos> and <eos>
            # Iterate until one word before <eos> which should be ('.'), 
            # reduce length by 3
            # Skip last entry ('.') and add until the end
            for word_id in sent_word_ids[:len(preprocessed_sent)-3]:
                word = preprocessed_sent[word_index]
                if word_id in compression_word_ids:
                    if word in vocabulary:
                        target_sent_id.append(word2id[word])
                    else:
                        target_sent_id.append(len(vocabulary)+1)
                else:
                    target_sent_id.append(0)
                word_index += 1

            # Append ('.') id; it's in the word ids but not in the compressed ids
            # It may not be a ('.'), first check to make sure
            if preprocessed_sent[-2] in vocabulary:
                target_sent_id.append(word2id[preprocessed_sent[-2]])
            else:
                target_sent_id.append(len(vocabulary)+1)
            # Append <eos> id
            target_sent_id.append(word2id[preprocessed_sent[-1]]) 
            target_seq_id.append(target_sent_id)
            sent_index += 1
            
    return target_seq_id
Example #25
    scores = np.zeros([1, 2])
    for x in list(enumerate(grid)):
        for y in list(enumerate(x[1])):
            d = y[1]
            if len(d) == 0:
                continue
            print('(%d, %d)' % (x[0] * 2 - 90, y[0] * 2 - 180))
            curmod = train_and_test(d[:, 2], d[:, 3])
            if curmod:
                models[x[0]][y[0]] = curmod
                scores = np.append(scores, [[curmod[1], curmod[2]]], axis=0)

    scores = np.delete(scores, 0, axis=0)
    print(' Done')
    print('')
    print('used last %d days of data for training' % days)
    print('models trained: %d' % len(scores))
    print('avg train set size per model: %f, min %d, max %d' %
          (np.mean(scores[:, 1]), np.min(scores[:, 1]), np.max(scores[:, 1])))
    print('total R-squared median: %f' % np.median(scores[:, 0]))


data = read_data()

train(data, days)

#data = data[data[:,0].argsort()] # sort by latitude
#data = np.array_split(data, n_models, axis=0)
#metadata = np.asarray([(a.min(axis=0)[0],a.max(axis=0)[0]) for a in data])
#data = data[data[:,2].argsort()] # sort by time
c_range = [round(0.1 * a, 1) for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # 'poly' takes too long, so it is left out for now
############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################  take line-trans as feature   #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
print('take LINE-TRANS as feature')
wiki_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_line_trans/'
lyric_src = '/home/liruihan/Desktop/data/uni_entropy_trans/'
wa_src = '/home/liruihan/Desktop/data/wa_entropy/wa_line_trans/'
cm_src = '/home/liruihan/Desktop/data/cm_entropy/cm_line_trans/'
aozora_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_line_trans/'

wiki_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_src)]
lyric_line_trans = [e.split('\n')[:-1] for e in pre.read_data(lyric_src,'num_')]
wa_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_src)]
cm_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_src)]
aozora_line_trans = [e.split('\n')[:-1] for e in pre.read_data(aozora_src)]

wiki_line_percentile_entropy = []
for trans in wiki_line_trans:
    wiki_line_percentile_entropy.append(percentile_entropy(trans))

wa_line_percentile_entropy = []
for trans in wa_line_trans:
    wa_line_percentile_entropy.append(percentile_entropy(trans))

lyric_line_percentile_entropy = []
for trans in lyric_line_trans:
Example #27
import pandas as pd
import pickle
from preprocess import read_data

# To add:
# - average length of tweets
# - # of tweets containing emotions in language
# - # of tweets containing emojis


data = pickle.load(open('data/data_df.pickle', 'rb'))

months = {2:'february', 3:'march', 4:'april', 5:'may', 6:'june', 7:'july'}
f = open('data/short_overview.txt','w')

orig_data = read_data('data/40wita')
total_after_clean = 0
total_before_clean = 0
for month in months.keys():
    tweets = len(data[data['month']==month]['cleaned_text'])
    total_after_clean += tweets
    orig_tweets = len(orig_data[orig_data['month']==month]['text'])
    total_before_clean += orig_tweets
    max_tweets = 0
    min_tweets = 2000000
    orig_days = len(sorted(orig_data[orig_data['month']==month].day.unique()))
    days = sorted(data[data['month']==month].day.unique())

    for day in days:
        twts = len(data[(data['month']==month) & (data['day']==day)]['cleaned_text'])
        if twts > max_tweets:
Example #28
    return list(map(int, processed.keys()))


def update_processed(k, v, tested_file):
    if not os.path.isfile(tested_file):
        current = {}
    else:
        with open(tested_file) as f:
            current = json.load(f)
    current[k] = v
    with open(tested_file, "w") as f:
        json.dump(current, f)


if __name__ == "__main__":
    dataset, labels, filenames = read_data("data")

    # define some files that will be used as a support
    experiment = "experiment_name"  # experiment name (unique Id)
    tested_file = f"tested_{experiment}.json"  # this file will contain the results of the experiments
    confs_file = f"allconfs_{experiment}.pkl"  # this file contains a list of all parameter configurations to be tested

    if not os.path.isfile(confs_file):
        # if there is no configuration file containing a list of all
        # parameters to be tried, a new one is generated
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        # performing both k-fold cross-validation and hold out
        folds = [ ("kfold", v) for v in list(skf.split(dataset, labels)) ]+ \
                [ ("tt", train_test_split(list(range(len(labels))), train_size=300, shuffle=False)) ]
Example #29
c_range = [round(0.1 * a, 1)
           for a in range(1, 10)] + [1] + list(range(10, 151, 10))
kernels = ['linear', 'rbf', 'sigmoid']  # 'poly' takes too long, so it is left out for now
############################################################################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
########################  take acc-line-trans as feature   #################################
########################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#################################
############################################################################################
wiki_acc_src = '/home/liruihan/Desktop/data/wiki_entropy/wiki_accumulated_line_trans/'
lyric_acc_src = '/home/liruihan/Desktop/data/uni_accumulated_entropy_trans/'
wa_acc_src = '/home/liruihan/Desktop/data/wa_entropy/wa_accumulated_line_trans/'
cm_acc_src = '/home/liruihan/Desktop/data/cm_entropy_20190115/cm_accumulated_line_trans/'
aozora_acc_src = '/home/liruihan/Desktop/data/aozora_entropy/aozora_accumulated_line_trans/'

wiki_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wiki_acc_src)]
lyric_acc_line_trans = [
    e.split('\n')[:-1] for e in pre.read_data(lyric_acc_src, 'num_')
    if e != '0\n'
]
wa_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(wa_acc_src)]
cm_acc_line_trans = [e.split('\n')[:-1] for e in pre.read_data(cm_acc_src)]
aozora_acc_line_trans = [
    e.split('\n')[:-1] for e in pre.read_data(aozora_acc_src)
]

wiki_percentile_entropy = []
for trans in wiki_acc_line_trans:
    wiki_percentile_entropy.append(percentile_entropy(trans))

wa_percentile_entropy = []
Example #30
import numpy as np
from keras.models import Sequential
from keras.layers import BatchNormalization, Convolution1D, Dropout, Flatten, Dense, Convolution2D
from preprocess import read_data

#Loading the data
filename = "driving_log.csv"

# y_train is the angle of the camera
X_train, y_train = read_data(filename,
                             pre_process=True,
                             flip=True,
                             dropSmallValuesWithRate=50)

#My model
model = Sequential()

print("The shape of the model is: " + str(X_train[0].shape))


def train_model(X_train, y_train):
    if len(X_train[0].shape) == 2:
        print("Using two dimensional network")
        model.add(
            BatchNormalization(input_shape=(X_train[0].shape[0],
                                            X_train[0].shape[1])))
        model.add(Convolution1D(5, 5))
        model.add(Convolution1D(5, 5))
        model.add(Convolution1D(3, 3))
        model.add(Convolution1D(3, 3))
    else: