Example #1
def get_language():
    "检测语言是否为中文"
    from Ref_Data import replace_word
    import json
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException
    train = input.read_dataset('train.csv').fillna(replace_word['unknow'])
    test  = input.read_dataset('test.csv').fillna(replace_word['unknow'])

    records = {}

    for index, row in tqdm(train.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['tr' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['tr' + str(index)] = (row['comment_text'], 'none', 0)

    for index, row in tqdm(test.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['te' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['te' + str(index)] = (row['comment_text'], 'none', 0)
    records = sorted(records.items(), key=lambda item: item[1][2], reverse=True)
    with open('language_record.json', 'w') as f:
        f.write(json.dumps(records, indent=4, separators=(',', ': '), ensure_ascii=False))
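Every example on this page relies on a project-local input.read_dataset helper that is not shown here. A minimal sketch of what such a helper might look like (the PATH constant and the pandas-based implementation are assumptions, not the project's actual module):

import pandas as pd

PATH = './data/'  # assumed location of the CSV files

def read_dataset(filename, usecols=None):
    # hypothetical stand-in for input.read_dataset: load a CSV from the data directory
    return pd.read_csv(PATH + filename, usecols=usecols)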
Example #2
def tfidfFeature(n_components=CHAR_N):
    ''' TF-IDF Vectorizer '''
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(
        replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist(
    ) + test['comment_text'].values.tolist()

    def pca_compression(model_tfidf, n_components):
        np_model_tfidf = model_tfidf.toarray()
        pca = PCA(n_components=n_components)
        pca_model_tfidf = pca.fit_transform(np_model_tfidf)
        return pca_model_tfidf

    tfv = TfidfVectorizer(min_df=100,
                          max_features=30000,
                          strip_accents='unicode',
                          analyzer='char',
                          ngram_range=(2, 4),
                          use_idf=1,
                          smooth_idf=True,
                          sublinear_tf=True)
    model_tfidf = tfv.fit_transform(text)

    # get the NumPy array after PCA compression
    pca_model_tfidf = pca_compression(model_tfidf, n_components=n_components)
    # build a DataFrame with named feature columns
    print(pca_model_tfidf.shape)
    cols = ["tfidf" + str(x) for x in range(n_components)]
    pca_model_tfidf = pd.DataFrame(pca_model_tfidf, columns=cols)

    for col in cols:
        pca_model_tfidf[col] = \
            (pca_model_tfidf[col]-pca_model_tfidf[col].mean())/pca_model_tfidf[col].std()
        list_col = pca_model_tfidf[col].tolist()
        train[col] = list_col[:len(train)]
        test[col] = list_col[len(train):]

    print('save')
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
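The per-column loop above z-score normalizes each PCA feature and then splits the rows back into train and test. Inside tfidfFeature it could be replaced by a vectorized equivalent (a sketch, not the author's code):

    pca_model_tfidf = (pca_model_tfidf - pca_model_tfidf.mean()) / pca_model_tfidf.std()
    train[cols] = pca_model_tfidf.iloc[:len(train)].values
    test[cols] = pca_model_tfidf.iloc[len(train):].values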
Example #3
def add_comment(index, file):
    import input
    if file == 'te':
        dataset = input.read_dataset('test.csv')
    else:
        dataset = input.read_dataset('train.csv')
    with open('language_record.json') as f:
        comments = json.loads(f.read())
    for i in index:
        comment = [
            file+str(i),
            [
                dataset.loc[i,'comment_text'],
                "add",
                1
            ]
        ]
        comments.append(comment)
    with open('language_record.json', 'w') as f:
        f.write(json.dumps(comments, indent=4, separators=(',', ': '), ensure_ascii=False))
Example #4
def generator_char_vec(wordvecfile='crawl'):
    embeddings_index = input.read_wordvec(wordvecfile)

    words = embeddings_index.keys()
    chars = []
    for w in words:
        chars.append(char_analyzer(w))
    del embeddings_index

    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    text = train['char_text'].tolist()
    text += test['char_text'].tolist()

    import itertools, json
    corpus_chars = list(itertools.chain.from_iterable(text))  # flatten the 2-D list into 1-D
    corpus_chars += chars
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    with open(PATH + 'char2index.json', 'w') as f:
        f.write(json.dumps(char_to_idx, indent=4, separators=(',', ': ')))
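char_analyzer comes from elsewhere in the project and is not shown on this page. For illustration only, a hypothetical analyzer that splits a token into single characters would be enough to run the example:

def char_analyzer(word):
    # hypothetical stand-in: map a token to its list of characters
    return list(word)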
Example #5
def char2idx(wordvecfile):
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(
        replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist(
    ) + test['comment_text'].values.tolist()
    text = tokenize_word(text)
    input.read_wordvec(wordvecfile)

    def get_ch_seqs(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())

        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(batch_char_analyzer,
                                      args=(comments[i * aver_t:(i + 1) *
                                                     aver_t], True))
            results.append(result)
        pool.close()
        pool.join()

        ch_seqs = []
        for result in results:
            char_seq = result.get()
            ch_seqs.extend(char_seq)

        return ch_seqs

    import itertools, json
    corpus_chars = get_ch_seqs(text)
    corpus_chars = list(
        itertools.chain.from_iterable(corpus_chars))  # flatten the 2-D list into 1-D
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    with open(PATH + 'char2index.json', 'w') as f:
        f.write(json.dumps(char_to_idx, indent=4, separators=(',', ': ')))
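Once char2index.json exists, a downstream step can map character sequences to integer ids. A minimal illustration (PATH is the project's data-path constant; the lookup helper below is an assumption about how the mapping would be consumed):

import json

with open(PATH + 'char2index.json') as f:
    char_to_idx = json.load(f)

def encode_chars(char_seq, unk_idx=0):
    # hypothetical helper: map each character to its index, falling back to unk_idx
    return [char_to_idx.get(ch, unk_idx) for ch in char_seq]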
Example #6
def get_char_text():
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(
        replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist(
    ) + test['comment_text'].values.tolist()
    text = tokenize_word(text)

    def get_ch_seqs(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())

        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(batch_char_analyzer,
                                      args=(comments[i * aver_t:(i + 1) *
                                                     aver_t], True))
            results.append(result)
        pool.close()
        pool.join()

        ch_seqs = []
        for result in results:
            char_seq = result.get()
            ch_seqs.extend(char_seq)

        return ch_seqs

    seqs = get_ch_seqs(text)
    train['char_text'] = seqs[:len(train)]
    test['char_text'] = seqs[len(train):]
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
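The chunk-and-Pool pattern inside get_ch_seqs recurs in several of these examples. A reusable helper in the same spirit (a sketch, not part of the original project):

import multiprocessing as mlp

def parallel_map(func, items, *extra_args):
    # split items into one chunk per CPU, apply func to each chunk asynchronously,
    # then concatenate the per-chunk results in order
    pool = mlp.Pool(mlp.cpu_count())
    chunk = int(len(items) / mlp.cpu_count()) + 1
    async_results = [
        pool.apply_async(func, args=(items[i * chunk:(i + 1) * chunk],) + extra_args)
        for i in range(mlp.cpu_count())
    ]
    pool.close()
    pool.join()
    results = []
    for r in async_results:
        results.extend(r.get())
    return results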
Example #7
def pipeline(
        file=(
            'train.csv',
            'test.csv',
            # 'train_fr.csv','train_es.csv','train_de.csv'
        )):
    for filename in tqdm(file):
        dataset = input.read_dataset(filename)
        # dataset = translation_sub(dataset,filename[:2])
        dataset.fillna(replace_word['unknow'], inplace=True)
        dataset = createFeature.countFeature(dataset)
        clean_dataset(dataset, 'clean_' + filename)

    createFeature.get_char_text()
    from ConvAIData import get_label_feature
    get_label_feature()
Example #8
def get_other_language_train(lang='nl'):
    import multiprocessing as mlp
    from Ref_Data import replace_word, PATH

    dataset = input.read_dataset('train.csv')
    dataset.fillna(replace_word['unknow'], inplace=True)
    comments = dataset['comment_text'].tolist()

    results = []
    pool = mlp.Pool(mlp.cpu_count())
    aver_t = int(len(comments) / mlp.cpu_count()) + 1
    for i in range(mlp.cpu_count()):
        result = pool.apply_async(get_other_lang_train,
                                  args=(comments[i * aver_t:(i + 1) * aver_t], lang))
        results.append(result)
    pool.close()
    pool.join()

    translation = []
    for result in results:
        translation.extend(result.get())
    dataset['comment_text'] = translation
    dataset.to_csv(PATH + lang + '_train.csv', index=False)
Example #9
def train_and_evaluate(args):
    """Train and evaluate custom Estimator with three training modes.

  Given the dictionary of parameters, create custom Estimator and run up to
  three training modes then return Estimator object.

  Args:
    args: Dictionary of parameters.

  Returns:
    Estimator object.
  """
    # Create our custom estimator using our model function
    estimator = tf.estimator.Estimator(
        model_fn=anomaly_detection,
        model_dir=args["output_dir"],
        params={key: val
                for key, val in args.items()})

    if args["training_mode"] == "reconstruction":
        if args["model_type"] == "pca":
            estimator.train(input_fn=read_dataset(
                filename=args["train_file_pattern"],
                mode=tf.estimator.ModeKeys.EVAL,
                batch_size=args["train_batch_size"],
                params=args),
                            steps=None)
        else:  # dense_autoencoder or lstm_enc_dec_autoencoder
            # Create early stopping hook to help reduce overfitting
            early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
                estimator=estimator,
                metric_name="rmse",
                max_steps_without_decrease=100,
                min_steps=1000,
                run_every_secs=60,
                run_every_steps=None)

            # Create train spec to read in our training data
            train_spec = tf.estimator.TrainSpec(input_fn=read_dataset(
                filename=args["train_file_pattern"],
                mode=tf.estimator.ModeKeys.TRAIN,
                batch_size=args["train_batch_size"],
                params=args),
                                                max_steps=args["train_steps"],
                                                hooks=[early_stopping_hook])

            # Create eval spec to read in our validation data and export our model
            eval_spec = tf.estimator.EvalSpec(
                input_fn=read_dataset(filename=args["eval_file_pattern"],
                                      mode=tf.estimator.ModeKeys.EVAL,
                                      batch_size=args["eval_batch_size"],
                                      params=args),
                steps=None,
                start_delay_secs=args[
                    "start_delay_secs"],  # start eval after N secs
                throttle_secs=args["throttle_secs"])  # evaluate every N secs

            # Create train and evaluate loop to train and evaluate our estimator
            tf.estimator.train_and_evaluate(estimator=estimator,
                                            train_spec=train_spec,
                                            eval_spec=eval_spec)
    else:
        # if args["training_mode"] == "calculate_error_distribution_statistics"
        # Get final mahalanobis statistics over the entire val_1 dataset

        # if args["training_mode"] == "tune_anomaly_thresholds"
        # Tune anomaly thresholds using val_2 and val_anom datasets
        train_spec = tf.estimator.TrainSpec(
            input_fn=read_dataset(
                filename=args["train_file_pattern"],
                mode=tf.estimator.ModeKeys.EVAL,  # read through val data once
                batch_size=args["train_batch_size"],
                params=args),
            max_steps=args["train_steps"])

        if args["training_mode"] == "calculate_error_distribution_statistics":
            # Evaluate until the end of eval files
            eval_steps = None

            # Don't create exporter for serving yet since anomaly thresholds
            # aren't trained yet
            exporter = None
        elif args["training_mode"] == "tune_anomaly_thresholds":
            if args["labeled_tune_thresh"]:
                # Evaluate until the end of eval files
                eval_steps = None
            else:
                # Don't evaluate
                eval_steps = 0

            # Create exporter that uses serving_input_fn to create saved_model
            # for serving
            exporter = tf.estimator.LatestExporter(
                name="exporter",
                serving_input_receiver_fn=lambda: serving_input_fn(args[
                    "seq_len"]))
        else:
            print("{0} isn't a valid training mode!".format(
                args["training_mode"]))

        # Create eval spec to read in our validation data and export our model
        eval_spec = tf.estimator.EvalSpec(
            input_fn=read_dataset(filename=args["eval_file_pattern"],
                                  mode=tf.estimator.ModeKeys.EVAL,
                                  batch_size=args["eval_batch_size"],
                                  params=args),
            steps=eval_steps,
            exporters=exporter,
            start_delay_secs=args[
                "start_delay_secs"],  # start eval after N secs
            throttle_secs=args["throttle_secs"])  # evaluate every N secs

    if (args["training_mode"] == "calculate_error_distribution_statistics"
            or args["training_mode"] == "tune_anomaly_thresholds"):
        # Create train and evaluate loop to train and evaluate our estimator
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=train_spec,
                                        eval_spec=eval_spec)
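train_and_evaluate expects a flat dictionary of parameters. An illustrative args dictionary using only the keys referenced in the function above (the values are placeholders, not recommended settings, and the model function may require additional keys not shown here):

args = {
    "output_dir": "gs://my-bucket/model",  # hypothetical output path
    "training_mode": "reconstruction",  # or "calculate_error_distribution_statistics", "tune_anomaly_thresholds"
    "model_type": "dense_autoencoder",  # or "pca", "lstm_enc_dec_autoencoder"
    "train_file_pattern": "data/train-*.tfrecord",  # hypothetical file pattern
    "eval_file_pattern": "data/eval-*.tfrecord",
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "train_steps": 10000,
    "start_delay_secs": 60,
    "throttle_secs": 120,
    "labeled_tune_thresh": True,
    "seq_len": 30,
}
train_and_evaluate(args)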
Example #10
def _try_params(n_iterations, batch_size, fun_shape, em_shape, db_path, lr, optimizer, scheduler, net3, tl,counter):
    "try some parameters, report testing accuracy with square loss"
    # read data
    x_train, y_train, x_test, y_test = read_dataset(db_path, batch_size)
    # initialize training/testing graph
    # initialize session
    sess = tf.Session()
    coord = tf.train.Coordinator()
    _, yhat_train, X = eval(net3)(X=x_train, fun_shape=fun_shape, em_shape=em_shape, sess=sess, coord=coord, tl=tl)
    # find accuracy on train data
    y_ = tf.expand_dims(y_train, 1)
    y__ = y_
    for i in range(em_shape[0] - 1):
        y__ = tf.concat([y__, y_], axis=1)
    yhat_predicted = tf.nn.softmax(yhat_train)

    correct_prediction = tf.equal(tf.argmax(yhat_predicted, 2), tf.argmax(y__, 2))
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    #find loss
    train_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat_train, labels=y__)

    #find accuracy on test data
    l0 = tf.expand_dims(x_test, 1)
    W1 = X[0]
    l1 = tf.reduce_sum(tf.expand_dims(l0, 3) * W1, axis=2)
    l1_act = tf.nn.relu(l1)
    W2 = X[1]
    l2 = tf.reduce_sum(tf.expand_dims(l1_act, 3) * W2, axis=2)
    l2_act = tf.nn.relu(l2)
    W3 = X[2]
    yhat_train = tf.reduce_sum(tf.expand_dims(l2_act, 3) * W3, axis=2)
    y_ = tf.expand_dims(y_test, 1)
    y__ = y_
    for i in range(em_shape[0] - 1):
        y__ = tf.concat([y__, y_], axis=1)

    yhat = tf.nn.softmax(yhat_train)

    correct_test_prediction = tf.equal(tf.argmax(yhat, 2), tf.argmax(y__, 2))
    test_acc_ = tf.reduce_mean(tf.cast(correct_test_prediction, tf.float32))

    lr_current = tf.placeholder(tf.float32)
    train_step = eval(optimizer)(learning_rate=lr_current).minimize(train_loss)

    sess.run(tf.global_variables_initializer())
    tf.train.start_queue_runners(sess, coord)
    _train_losses = []
    accuracy_container = []
    test_acc_container = []

    mean_accuracy = 0

    for i in range(n_iterations):

        start = time.time()

        _train_loss, accuracy, t_accuracy, _ = sess.run([train_loss, acc, test_acc_, train_step], feed_dict={lr_current: lr})
        mean_accuracy += accuracy

        _train_losses.append(_train_loss)
        accuracy_container.append(accuracy)
        test_acc_container.append(t_accuracy)

        if scheduler == "none":
            pass
        elif scheduler == "dlp":
            # scheduler the step size
            if i % 2000 == 1999:
                if np.mean(_train_losses[-1000:]) >= np.mean(_train_losses[-2000:-1000]):
                    lr = lr * 0.5
        else:
            machine = socket.gethostname()
            if machine != "viacheslav-HP-Pavilion-Notebook":
                raise ValueError("unknown scheduler")

            else:
                pass
            if i % 100 == 0:
                # print "argmin of the train loss", _y_diff[_train_loss.argmin()],
                print("step:", i, "mean_loss", np.mean(_train_loss), "min_loss", np.min(_train_loss), )
                # print "y_train:", np.mean(_y_train), np.var(_y_train),_y_train.shape,
                # print "y_hat:", np.mean(_yhat_train),np.var(_yhat_train), _yhat_train.shape,
                print("mean accuracy=", mean_accuracy, "%")
                print("lr", lr)
                print("exps:", cfg.batch_size / (time.time() - start))
                mean_accuracy = 0

        # history
        if i % 1000 == 1:
            _train_loss, = sess.run([train_loss], feed_dict={lr_current: lr})

    sess.close()
    tf.reset_default_graph()
    np.savetxt(os.path.join(cfg.out_path, config_name + "_aam_train_accuracy_" + str(lr)+str(counter)), accuracy_container)
    np.savetxt(os.path.join(cfg.out_path, config_name + "_aam_test_accuracy_" + str(lr) + str(counter)),test_acc_container)

    sel_point = np.mean(_train_losses[-200:-100], axis=0).argmin()
    minmean_loss = np.mean(_train_losses[:-100], axis=0)[sel_point]
    loss_hist = np.asarray(_train_losses)[:, sel_point]

    return minmean_loss, loss_hist
Example #11
def save_result(test_predicts, outputfile):
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    sample_submission = input.read_dataset('sample_submission.csv')
    sample_submission[list_classes] = test_predicts
    sample_submission.to_csv(outputfile, index=False, compression='gzip')
Example #12
def splitTarget(filename):
    list_classes = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    labels = input.read_dataset(filename, list_classes)
    labels.to_csv(PATH + 'labels.csv', index=False)
Example #13
def Boost(X, Y, test, para, setting, outputfile='boost.csv.gz', cal_weight=False):

    def _train_model(model, model_name, train_x, train_y, val_x, val_y,
                     test, batchsize=BATCHSIZE, frequency=50, init=30):
        from sklearn.metrics import roc_auc_score

        generator = tool.Generate(train_x, train_y, batchsize=frequency * batchsize)

        epoch = 1
        best_epoch = 1
        best_score = -1

        while True:

            samples_x, samples_y = generator.genrerate_samples()
            model.fit(samples_x, samples_y, batch_size=batchsize, epochs=1, verbose=0)

            if epoch >= init:
                # evaluate
                y_pred = model.predict(val_x, batch_size=2048, verbose=0)
                Scores = []
                for i in range(6):
                    score = roc_auc_score(val_y[:, i], y_pred[:, i])
                    Scores.append(score)
                cur_score = np.mean(Scores)
                print(cur_score)
                print(Scores)

                if epoch == init or best_score < cur_score:
                    best_score = cur_score
                    best_epoch = epoch
                    print(best_score, best_epoch, '\n')
                    result = y_pred
                    model.save_weights(WEIGHT_FILE + model_name)
                elif epoch - best_epoch > 12:  # early-stopping patience of 12 epochs
                    model.load_weights(WEIGHT_FILE + model_name, by_name=False)
                    test_pred = model.predict(test, batch_size=2048)
                    return test_pred, result
            epoch += 1

    def get_model(model_para):
        m = nnBlock.model(
            embedding_matrix=model_para['embedding_matrix'],
            trainable=model_para['trainable'],
            load_weight=model_para['load_weight'],
            loss=model_para['loss'],
            boost=model_para['boost'],
        )
        m.get_layer(model_para['modelname'])
        return m

    def cv(model_para, X, Y, test, K=5, init=30, sample_weight=None):

        kf = KFold(len(Y), n_folds=K, shuffle=False)

        results = []
        train_score = np.zeros((len(Y),6))

        for i, (train_index, valid_index) in enumerate(kf):
            print('Training fold {}...'.format(i))
            trainset = tool.splitdata(train_index, X)
            label_train = Y[train_index]

            validset = tool.splitdata(valid_index, X)
            label_valid = Y[valid_index]

            model = get_model(model_para)
            test_pred, val_score = _train_model(model,
                                model_para['modelname'] + "_" + str(i) + ".h5",
                                trainset, label_train,
                                validset, label_valid, test,init=init)

            train_score[valid_index] = val_score
            results.append(test_pred)

        test_predicts = tool.cal_mean(results, None)
        return train_score, test_predicts

    train_score, test_score = cv(para, X, Y, test, init=30)
    para['boost'] = True
    if cal_weight:
        para['sample_weight'] = np.ones(len(Y))
    for loss, model_name in setting:
        X['boost'] = train_score
        test['boost'] = test_score
        para['loss'] = loss
        para['modelname'] = model_name
        if cal_weight:
            para['sample_weight'] = get_weight(train_score, Y, para['sample_weight'])
        train_score, test_score = cv(para, X, Y, test, init=5)

    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    sample_submission = input.read_dataset('sample_submission.csv')
    sample_submission[list_classes] = test_score
    sample_submission.to_csv(outputfile, index=False, compression='gzip')
Example #14
def createKmeansFeature(usecols, name, k=6):
    from sklearn.cluster import KMeans
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    data = train.append(test)[usecols].values

    # def distMeas(vecA, vecB):
    #     return np.sqrt(np.sum(np.power(vecA - vecB, 2), axis=1))
    #
    # def KMeans(dataSet, k):
    #     """
    #     k-means clustering algorithm.
    #     Create k centroids, assign every point to its nearest centroid, then recompute
    #     the centroids. Repeat until the cluster assignments no longer change.
    #     """
    #     def createRandCent(dataSet, k):
    #         """
    #         Build a set of k random centroids for the given dataset.
    #         """
    #         n = dataSet.shape[1]  # number of columns
    #         feature_min = dataSet.min(axis=0)  # lower bound of each feature
    #         feature_range = dataSet.max(axis=0) - feature_min
    #         centroids = feature_min + feature_range * np.random.random((k, n))
    #         return centroids
    #
    #     m = dataSet.shape[0]  # number of rows
    #     clusterAssment = np.zeros(m)  # one entry per row of dataSet, holding its cluster assignment
    #     centroids = createRandCent(dataSet, k)  # initialize k random centroids
    #     distance = np.zeros((m, k))
    #     clusterChanged = True
    #     while clusterChanged:
    #         for j in range(k):
    #             distance[:, j] = distMeas(centroids[j, :], dataSet)
    #
    #         sample_cluster = distance.argmin(axis=1)  # cluster each sample belongs to
    #         num_change = np.sum(clusterAssment != sample_cluster)  # how many samples changed cluster
    #         if num_change == 0:
    #             clusterChanged = False
    #         clusterAssment = sample_cluster
    #
    #         for center in range(k):  # update centroid positions
    #             ptsInClust = dataSet[clusterAssment == center]  # all points in this cluster
    #             centroids[center, :] = np.mean(ptsInClust, axis=0)
    #         # handle NaN values
    #         centroids = np.nan_to_num(centroids)
    #     return centroids

    # samples = data[usecols].values
    # centroids = KMeans(samples, k)       # k-means clustering

    # for j in range(k):  # k is the number of centroids
    #     data["kmeans" + str(j + 1)] = \
    #         distMeas(centroids[j, :], samples)  # distance from each data point to each centroid

    model = KMeans(k, max_iter=3000, tol=1e-6, n_jobs=-1)
    features = model.fit_transform(data)  # distances from each sample to the k cluster centers
    for i in range(k):
        train[name + '_kmean_' + str(i)] = features[:len(train), i]
        test[name + '_kmean_' + str(i)] = features[len(train):, i]

    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
Example #15
def get_pos_tag_vec():
    from nltk import pos_tag
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(
        replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist(
    ) + test['comment_text'].values.tolist()
    text = tokenize_word(text)

    def get_tag_text(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())

        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(get_tag,
                                      args=(comments[i * aver_t:(i + 1) *
                                                     aver_t], pos_tag))
            results.append(result)
        pool.close()
        pool.join()

        text_tag = []
        word2tag = {}
        for result in results:
            t_tag, word_2_vec = result.get()
            text_tag.extend(t_tag)
            word2tag.update(word_2_vec)
        return text_tag, word2tag

    def getTfidfVector(clean_corpus,
                       min_df=0,
                       max_features=int(1e10),
                       ngram_range=(1, 1),
                       use_idf=False,
                       sublinear_tf=True):
        def tokenizer(t):
            return t.split()

        tfv = TfidfVectorizer(min_df=min_df,
                              max_features=max_features,
                              tokenizer=tokenizer,
                              strip_accents=None,
                              analyzer="word",
                              ngram_range=ngram_range,
                              use_idf=use_idf,
                              sublinear_tf=sublinear_tf)
        tag_tfidf = tfv.fit_transform(clean_corpus)
        return tag_tfidf, list(tfv.get_feature_names())

    text_tag, word2tag = get_tag_text(text)
    import json

    with open(PATH + 'word2tag.json', 'w') as f:
        f.write(json.dumps(word2tag, indent=4, separators=(',', ': ')))

    tag_tfidf, columns = getTfidfVector(text_tag)
    n_components = POSTAG_DIM  # inspect pca.lambdas_ and keep enough components to cover ~99% of the variance
    pca = KernelPCA(n_components=n_components, kernel='rbf', n_jobs=-1)
    pca_tfidf = pca.fit_transform(tag_tfidf.transpose()).transpose()

    postag_vec = pd.DataFrame(pca_tfidf, columns=columns)
    postag_vec.to_csv(PATH + 'postagVec.csv', index=False)
Example #16
def LDAFeature(num_topics=NUM_TOPIC):
    from gensim.corpora import Dictionary
    from gensim.models.ldamulticore import LdaMulticore

    def get_corpus(dictionary, text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())

        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(doc2bow,
                                      args=(comments[i * aver_t:(i + 1) *
                                                     aver_t], dictionary))
            results.append(result)
        pool.close()
        pool.join()

        corpus = []
        for result in results:
            corpus.extend(result.get())
        return corpus

    def inference(model, dataset):
        results = []
        pool = mlp.Pool(mlp.cpu_count())

        aver_t = int(len(dataset) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(lda_infer,
                                      args=(dataset[i * aver_t:(i + 1) *
                                                    aver_t], model))
            results.append(result)
        pool.close()
        pool.join()

        topics = []
        for result in results:
            topics.extend(result.get())
        return np.array(topics)

    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(
        replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist(
    ) + test['comment_text'].values.tolist()

    text = tokenize_word(text)

    freq = {}
    for sentence in text:
        for word in sentence:
            if word not in freq:
                freq[word] = 0
            freq[word] += 1

    text = [[word for word in sentence if freq[word] > FILTER_FREQ]
            for sentence in tqdm(text)]

    dictionary = Dictionary(text)  # build the (id, word) dictionary

    corpus = get_corpus(dictionary, text)
    print(len(corpus), len(corpus[0]))
    print('begin train lda')
    ldamodel = LdaMulticore(corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary)

    print('inference')
    topic_probability_mat = inference(ldamodel, corpus)
    print(len(topic_probability_mat), len(topic_probability_mat[0]))

    train_sparse = topic_probability_mat[:train.shape[0]]
    test_sparse = topic_probability_mat[train.shape[0]:]

    # count how many topic entries are zero, to gauge how many components are actually used
    zero_section = {}
    for topics in tqdm(train_sparse):
        num = np.sum(topics == 0)
        num = str(int(num))
        if num not in zero_section:
            zero_section[num] = 0
        zero_section[num] += 1
    for topics in tqdm(test_sparse):
        num = np.sum(topics == 0)
        num = str(int(num))
        if num not in zero_section:
            zero_section[num] = 0
        zero_section[num] += 1
    print(zero_section)

    print('save')
    for i in range(num_topics):
        train['topic' + str(i)] = 0
        test['topic' + str(i)] = 0
    train[['topic' + str(i) for i in range(num_topics)]] = train_sparse
    test[['topic' + str(i) for i in range(num_topics)]] = test_sparse

    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
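doc2bow and lda_infer are project helpers that are not shown on this page. Plausible stand-ins built on the standard gensim API (assumptions about the originals, included only to make the example readable):

def doc2bow(comments, dictionary):
    # convert each token list into a gensim bag-of-words representation
    return [dictionary.doc2bow(tokens) for tokens in comments]

def lda_infer(corpus_chunk, model):
    # return a dense topic-probability vector for each bag-of-words document
    return [[prob for _, prob in model.get_document_topics(bow, minimum_probability=0.0)]
            for bow in corpus_chunk]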