Example #1
def load_dataset(test_size=0.2):
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (
                config.ModelOutputDir, config.embedding_size,
                config.corpus_version, datestr))
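        # tl.files.load_npy_to_any unpickles the stored object as-is; here it
        # is a {word: vector} dict (see the word2vec.get(...) lookups below)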
    ## load train data
    with utils.timer('Load train data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file_1)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data_4 = utils.load_cs_deleted_data(cs_delete_file_2)
        print(data_4['label'].value_counts())
        # keep only the positive rows of the second cs-deleted source
        data = pd.concat(
            [data_1, data_2, data_3, data_4[data_4['label'] == 1].reset_index(drop=True)],
            axis=0, ignore_index=True)
        #data = pd.concat([data_1, data_2, data_3, data_4], axis=0,ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if not os.path.exists(DebugDir):
            os.makedirs(DebugDir)
        del data_4, data_3, data_2, data_1
        gc.collect()
    ## data representation
    with utils.timer('representation for train'):
        # X = [[word2vec.get(w, word2vec['_UNK']) for w in utils.cut(text)] for text in data['text'].values]
        X = []
        y = []
        for text, label in zip(data['text'].values, data['label'].values):
            if text == '':
                continue
            words = utils.cut(text)
            if len(words) == 0:
                continue
            X.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            y.append(label)

    del word2vec, data
    gc.collect()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size)

    return X_train, y_train, X_valid, y_valid
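
# Usage sketch (illustrative, not part of the original file). Note the return
# order: train X/y first, then valid X/y. Each entry of X_train is a
# variable-length list of word vectors, one vector per token.
X_train, y_train, X_valid, y_valid = load_dataset(test_size=0.2)
print('train: %d sequences, valid: %d sequences' % (len(X_train), len(X_valid)))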
Example #2
def LoadTrainData():
    """Load the four labelled sources and concatenate them into one DataFrame."""
    with utils.timer('Load train data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file_1)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data_4 = utils.load_cs_deleted_data(cs_delete_file_2)
        print(data_4['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3, data_4],
                         axis=0,
                         ignore_index=True)
        del data_4, data_3, data_2, data_1
        #data = data[:int(0.6 * len(data))]
        gc.collect()
    return data
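
# Usage sketch (illustrative, not part of the original file): the per-source
# label ratios are printed inside LoadTrainData(), but the balance of the
# combined frame is what a downstream classifier actually sees.
data = LoadTrainData()
print(data['label'].value_counts(normalize=True))  # combined class ratio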
Example #3
File: lr.py Project: RankingAI/text-mining
    'subsample': .9,
}

if __name__ == '__main__':
    ## load word2vec lookup table
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/model/word2vec_post_text_3d.npy' % config.DataBaseDir)

    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if not os.path.exists(DebugDir):
            os.makedirs(DebugDir)
        with pd.ExcelWriter('%s/raw.xlsx' % DebugDir) as writer:
            data.to_excel(writer, index=False)
        del data_3, data_2, data_1
        gc.collect()

    ## representation
    hit_words = []
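
# The snippet ends here; what follows is an assumed sketch (not the project's
# code) of the representation step an lr.py-style model could continue with:
# average each post's word vectors into one fixed-length feature and fit a
# linear classifier. Empty texts would need the same filtering as Example #1.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([
    np.mean([word2vec.get(w, word2vec['_UNK']) for w in utils.cut(text)], axis=0)
    for text in data['text'].values
])
clf = LogisticRegression(max_iter=1000).fit(X, data['label'].values)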
Example #4
def train_test_and_save_model():
    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if not os.path.exists(DebugDir):
            os.makedirs(DebugDir)
        #writer = pd.ExcelWriter('%s/raw.xlsx' % DebugDir)
        #data.to_excel(writer, index= False)
        #writer.close()
        del data_3, data_2, data_1
        gc.collect()

    X_raw_words = data['text'].apply(utils.cut)
    # assign an arbitrary (per-run) integer id to every unique token
    uni_words = list(set([w for rec in X_raw_words for w in rec]))
    word_dict = dict(zip(uni_words, range(len(uni_words))))
    X_words = [[word_dict[w] for w in rec] for rec in X_raw_words]
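    # augment_with_ngrams is not defined in this snippet; the stand-in below is
    # an assumption (hash higher-order n-grams into extra id buckets, in the
    # style of TensorLayer's fastText tutorial), included so the block runs.
    import array
    import hashlib

    def augment_with_ngrams(unigrams, unigram_vocab_size, n_buckets, n=2):
        # all k-grams, k = 2..n, over the unigram id sequence
        def get_ngrams(k):
            return list(zip(*[unigrams[i:] for i in range(k)]))

        # map each k-gram onto one of n_buckets ids placed after the unigram vocab
        def hash_ngram(ngram):
            bytes_ = array.array('L', ngram).tobytes()
            return unigram_vocab_size + int(hashlib.sha256(bytes_).hexdigest(), 16) % n_buckets

        return unigrams + [hash_ngram(g) for k in range(2, n + 1) for g in get_ngrams(k)]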
    X_words = np.array(X_words, dtype=object)  # ragged rows: object array keeps fancy indexing working even when N_GRAM is None
    y = np.array(data['label'])
    if N_GRAM is not None:
        X_words = np.array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=N_GRAM) for x in X_words], dtype=object)

    print(X_words.shape)
    print(y.shape)
    print(X_words[:5])
    print(y[:5])

    final_train_pred = np.zeros(len(X_words))
    for s in range(config.train_times):
        s_start = time.time()
        train_pred = np.zeros(len(X_words))

        classifier = FastTextClassifier(
            vocab_size=VOCAB_SIZE + N_BUCKETS,
            embedding_size=EMBEDDING_SIZE,
            n_labels=2,
        )
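        # FastTextClassifier is project code (mirroring TensorLayer's fastText
        # tutorial); the attributes used below (inputs, labels, train_op,
        # prediction_probs, save) are assumed from that tutorial, not shown here.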

        # shuffle must be enabled for random_state (varied by s) to change the folds
        skf = StratifiedKFold(config.kfold, shuffle=True, random_state=2018 * s)

        for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)):
            X_train, X_valid = X_words[train_index], X_words[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]

            with tf.Session() as sess:
                sess.run(tf.local_variables_initializer())
                tl.layers.initialize_global_variables(sess)

                for epoch in range(N_EPOCH):
                    start_time = time.time()
                    print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
                    for X_batch, y_batch in tl.iterate.minibatches(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                        sess.run(
                            classifier.train_op, feed_dict={
                                classifier.inputs: tl.prepro.pad_sequences(X_batch),
                                classifier.labels: y_batch,
                            }
                        )

                    valid_pred_proba = sess.run(
                        classifier.prediction_probs, feed_dict={
                            classifier.inputs: tl.prepro.pad_sequences(X_valid)
                        }
                    )[:, 1]
                    valid_pred_label = utils.proba2label(valid_pred_proba)
                    valid_auc = roc_auc_score(y_valid, valid_pred_proba)
                    valid_precision = precision_score(y_valid, valid_pred_label)
                    valid_recall = recall_score(y_valid, valid_pred_label)
                    if epoch == N_EPOCH - 1:
                        train_pred[valid_index] = valid_pred_proba  # collect out-of-fold predictions

                    # valid_precision = sess.run(
                    #     classifier.precision, feed_dict={
                    #         classifier.inputs: tl.prepro.pad_sequences(X_valid),
                    #         classifier.labels: y_valid,
                    #     }
                    # )
                    # valid_recall = sess.run(
                    #     classifier.recall, feed_dict={
                    #         classifier.inputs: tl.prepro.pad_sequences(X_valid),
                    #         classifier.labels: y_valid,
                    #     }
                    # )
                    print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (valid_auc, valid_precision, valid_recall, int(time.time() - start_time)))
                classifier.save(sess, MODEL_FILE_PATH)
            print('fold %s done!!!' % fold)
        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, utils.proba2label(train_pred))
        recall = recall_score(y, utils.proba2label(train_pred))
        print('auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (auc, precision, recall, int(time.time() - s_start)))
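
# utils.proba2label is project code not shown in these snippets; a plausible
# stand-in (an assumption, not the original) thresholds probabilities at 0.5:
import numpy as np

def proba2label(probs, threshold=0.5):
    # 1 where the predicted probability reaches the threshold, else 0
    return (np.asarray(probs) >= threshold).astype(int)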