def get_language():
    """Detect comments that are not in English and record them."""
    from Ref_Data import replace_word
    import json
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException

    train = input.read_dataset('train.csv').fillna(replace_word['unknow'])
    test = input.read_dataset('test.csv').fillna(replace_word['unknow'])

    records = {}
    for index, row in tqdm(train.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['tr' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['tr' + str(index)] = (row['comment_text'], 'none', 0)

    for index, row in tqdm(test.iterrows()):
        try:
            lang_prob = detect_langs(row['comment_text'])
            language = lang_prob[0].lang
            if language != 'en':
                records['te' + str(index)] = (row['comment_text'], language, lang_prob[0].prob)
        except LangDetectException:
            records['te' + str(index)] = (row['comment_text'], 'none', 0)

    # Sort by detection probability, most confident first
    records = sorted(records.items(), key=lambda item: item[1][2], reverse=True)
    with open('language_record.json', 'w') as f:
        f.write(json.dumps(records, indent=4, separators=(',', ': '), ensure_ascii=False))
def tfidfFeature(n_components=CHAR_N):
    '''TF-IDF Vectorizer'''
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist() + test['comment_text'].values.tolist()

    def pca_compression(model_tfidf, n_components):
        np_model_tfidf = model_tfidf.toarray()
        pca = PCA(n_components=n_components)
        pca_model_tfidf = pca.fit_transform(np_model_tfidf)
        return pca_model_tfidf

    tfv = TfidfVectorizer(min_df=100, max_features=30000,
                          strip_accents='unicode', analyzer='char',
                          ngram_range=(2, 4), use_idf=1,
                          smooth_idf=True, sublinear_tf=True)
    model_tfidf = tfv.fit_transform(text)

    # Get the dense numpy array after PCA
    pca_model_tfidf = pca_compression(model_tfidf, n_components=n_components)

    # Get the DataFrame with feature names attached
    print(pca_model_tfidf.shape)
    cols = ["tfidf" + str(x) for x in range(n_components)]
    pca_model_tfidf = pd.DataFrame(pca_model_tfidf, columns=cols)
    for col in cols:
        # Standardize each component, then split it back into train/test
        pca_model_tfidf[col] = \
            (pca_model_tfidf[col] - pca_model_tfidf[col].mean()) / pca_model_tfidf[col].std()
        list_col = pca_model_tfidf[col].tolist()
        train[col] = list_col[:len(train)]
        test[col] = list_col[len(train):]

    print('save')
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
def add_comment(index, file):
    import input
    if file == 'te':
        dataset = input.read_dataset('test.csv')
    else:
        dataset = input.read_dataset('train.csv')

    with open('language_record.json') as f:
        comments = json.loads(f.read())

    for i in index:
        comment = [
            file + str(i),
            [
                dataset.loc[i, 'comment_text'],
                "add",
                1
            ]
        ]
        comments.append(comment)

    with open('language_record.json', 'w') as f:
        f.write(json.dumps(comments, indent=4, separators=(',', ': '), ensure_ascii=False))
def generator_char_vec(wordvecfile='crawl'):
    embeddings_index = input.read_wordvec(wordvecfile)
    words = embeddings_index.keys()
    chars = []
    for w in words:
        chars.append(char_analyzer(w))
    del embeddings_index

    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    text = train['char_text'].tolist()
    text += test['char_text'].tolist()

    import itertools, json
    corpus_chars = list(itertools.chain.from_iterable(text))  # flatten the 2-D list into 1-D
    corpus_chars += chars
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    with open(PATH + 'char2index.json', 'w') as f:
        f.write(json.dumps(char_to_idx, indent=4, separators=(',', ': ')))
def char2idx(wordvecfile):
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist() + test['comment_text'].values.tolist()
    text = tokenize_word(text)
    input.read_wordvec(wordvecfile)

    def get_ch_seqs(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())
        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(batch_char_analyzer,
                                      args=(comments[i * aver_t:(i + 1) * aver_t], True))
            results.append(result)
        pool.close()
        pool.join()

        ch_seqs = []
        for result in results:
            char_seq = result.get()
            ch_seqs.extend(char_seq)
        return ch_seqs

    import itertools, json
    # Build the char sequences, then flatten the 2-D list into 1-D
    corpus_chars = get_ch_seqs(text)
    corpus_chars = list(itertools.chain.from_iterable(corpus_chars))
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    with open(PATH + 'char2index.json', 'w') as f:
        f.write(json.dumps(char_to_idx, indent=4, separators=(',', ': ')))
def get_char_text():
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist() + test['comment_text'].values.tolist()
    text = tokenize_word(text)

    def get_ch_seqs(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())
        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(batch_char_analyzer,
                                      args=(comments[i * aver_t:(i + 1) * aver_t], True))
            results.append(result)
        pool.close()
        pool.join()

        ch_seqs = []
        for result in results:
            char_seq = result.get()
            ch_seqs.extend(char_seq)
        return ch_seqs

    seqs = get_ch_seqs(text)
    train['char_text'] = seqs[:len(train)]
    test['char_text'] = seqs[len(train):]
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
def pipeline(
        file=(
            'train.csv',
            'test.csv',
            # 'train_fr.csv', 'train_es.csv', 'train_de.csv'
        )):
    for filename in tqdm(file):
        dataset = input.read_dataset(filename)
        # dataset = translation_sub(dataset, filename[:2])
        dataset.fillna(replace_word['unknow'], inplace=True)
        dataset = createFeature.countFeature(dataset)
        clean_dataset(dataset, 'clean_' + filename)

    createFeature.get_char_text()

    from ConvAIData import get_label_feature
    get_label_feature()
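# A minimal usage sketch, an assumption rather than part of the original source: the
# cleaning pipeline writes clean_train.csv / clean_test.csv, which the feature
# builders in this codebase then read and extend in place, e.g.
#
#   pipeline()         # clean the raw CSVs, add count features and char_text
#   tfidfFeature()     # append standardized char-level TF-IDF/PCA columns
#   LDAFeature()       # append topic-distribution columns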
def get_other_language_train(lang='nl'):
    import multiprocessing as mlp
    from Ref_Data import replace_word, PATH

    dataset = input.read_dataset('train.csv')
    dataset.fillna(replace_word['unknow'], inplace=True)
    comments = dataset['comment_text'].tolist()

    results = []
    pool = mlp.Pool(mlp.cpu_count())
    aver_t = int(len(comments) / mlp.cpu_count()) + 1
    for i in range(mlp.cpu_count()):
        result = pool.apply_async(get_other_lang_train,
                                  args=(comments[i * aver_t:(i + 1) * aver_t], lang))
        results.append(result)
    pool.close()
    pool.join()

    translation = []
    for result in results:
        translation.extend(result.get())

    dataset['comment_text'] = translation
    dataset.to_csv(PATH + lang + '_train.csv', index=False)
def train_and_evaluate(args):
    """Train and evaluate custom Estimator with three training modes.

    Given the dictionary of parameters, create custom Estimator and run up to
    three training modes then return Estimator object.

    Args:
        args: Dictionary of parameters.
    Returns:
        Estimator object.
    """
    # Create our custom estimator using our model function
    estimator = tf.estimator.Estimator(
        model_fn=anomaly_detection,
        model_dir=args["output_dir"],
        params={key: val for key, val in args.items()})

    if args["training_mode"] == "reconstruction":
        if args["model_type"] == "pca":
            estimator.train(
                input_fn=read_dataset(
                    filename=args["train_file_pattern"],
                    mode=tf.estimator.ModeKeys.EVAL,
                    batch_size=args["train_batch_size"],
                    params=args),
                steps=None)
        else:  # dense_autoencoder or lstm_enc_dec_autoencoder
            # Create early stopping hook to help reduce overfitting
            early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
                estimator=estimator,
                metric_name="rmse",
                max_steps_without_decrease=100,
                min_steps=1000,
                run_every_secs=60,
                run_every_steps=None)

            # Create train spec to read in our training data
            train_spec = tf.estimator.TrainSpec(
                input_fn=read_dataset(
                    filename=args["train_file_pattern"],
                    mode=tf.estimator.ModeKeys.TRAIN,
                    batch_size=args["train_batch_size"],
                    params=args),
                max_steps=args["train_steps"],
                hooks=[early_stopping_hook])

            # Create eval spec to read in our validation data and export our model
            eval_spec = tf.estimator.EvalSpec(
                input_fn=read_dataset(
                    filename=args["eval_file_pattern"],
                    mode=tf.estimator.ModeKeys.EVAL,
                    batch_size=args["eval_batch_size"],
                    params=args),
                steps=None,
                start_delay_secs=args["start_delay_secs"],  # start eval after N secs
                throttle_secs=args["throttle_secs"])  # evaluate every N secs

            # Create train and evaluate loop to train and evaluate our estimator
            tf.estimator.train_and_evaluate(
                estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
    else:
        # if args["training_mode"] == "calculate_error_distribution_statistics":
        #     Get final mahalanobis statistics over the entire val_1 dataset
        # if args["training_mode"] == "tune_anomaly_thresholds":
        #     Tune anomaly thresholds using val_2 and val_anom datasets
        train_spec = tf.estimator.TrainSpec(
            input_fn=read_dataset(
                filename=args["train_file_pattern"],
                mode=tf.estimator.ModeKeys.EVAL,  # read through val data once
                batch_size=args["train_batch_size"],
                params=args),
            max_steps=args["train_steps"])

        if args["training_mode"] == "calculate_error_distribution_statistics":
            # Evaluate until the end of eval files
            eval_steps = None

            # Don't create exporter for serving yet since anomaly thresholds
            # aren't trained yet
            exporter = None
        elif args["training_mode"] == "tune_anomaly_thresholds":
            if args["labeled_tune_thresh"]:
                # Evaluate until the end of eval files
                eval_steps = None
            else:
                # Don't evaluate
                eval_steps = 0

            # Create exporter that uses serving_input_fn to create saved_model
            # for serving
            exporter = tf.estimator.LatestExporter(
                name="exporter",
                serving_input_receiver_fn=lambda: serving_input_fn(args["seq_len"]))
        else:
            print("{0} isn't a valid training mode!".format(args["training_mode"]))

        # Create eval spec to read in our validation data and export our model
        eval_spec = tf.estimator.EvalSpec(
            input_fn=read_dataset(
                filename=args["eval_file_pattern"],
                mode=tf.estimator.ModeKeys.EVAL,
                batch_size=args["eval_batch_size"],
                params=args),
            steps=eval_steps,
            exporters=exporter,
            start_delay_secs=args["start_delay_secs"],  # start eval after N secs
            throttle_secs=args["throttle_secs"])  # evaluate every N secs

        if (args["training_mode"] == "calculate_error_distribution_statistics"
                or args["training_mode"] == "tune_anomaly_thresholds"):
            # Create train and evaluate loop to train and evaluate our estimator
            tf.estimator.train_and_evaluate(
                estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
def _try_params(n_iterations, batch_size, fun_shape, em_shape, db_path, lr,
                optimizer, scheduler, net3, tl, counter):
    """Try some parameters, report testing accuracy with square loss."""
    # read data
    x_train, y_train, x_test, y_test = read_dataset(db_path, batch_size)

    # initialize training/testing graph and session
    sess = tf.Session()
    coord = tf.train.Coordinator()
    _, yhat_train, X = eval(net3)(X=x_train, fun_shape=fun_shape,
                                  em_shape=em_shape, sess=sess, coord=coord, tl=tl)

    # find accuracy on train data
    y_ = tf.expand_dims(y_train, 1)
    y__ = y_
    for i in range(em_shape[0] - 1):
        y__ = tf.concat([y__, y_], axis=1)
    yhat_predicted = tf.nn.softmax(yhat_train)
    correct_prediction = tf.equal(tf.argmax(yhat_predicted, 2), tf.argmax(y__, 2))
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # find loss
    train_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat_train, labels=y__)

    # find accuracy on test data
    l0 = tf.expand_dims(x_test, 1)
    W1 = X[0]
    l1 = tf.reduce_sum(tf.expand_dims(l0, 3) * W1, axis=2)
    l1_act = tf.nn.relu(l1)
    W2 = X[1]
    l2 = tf.reduce_sum(tf.expand_dims(l1_act, 3) * W2, axis=2)
    l2_act = tf.nn.relu(l2)
    W3 = X[2]
    yhat_test = tf.reduce_sum(tf.expand_dims(l2_act, 3) * W3, axis=2)
    y_ = tf.expand_dims(y_test, 1)
    y__ = y_
    for i in range(em_shape[0] - 1):
        y__ = tf.concat([y__, y_], axis=1)
    yhat = tf.nn.softmax(yhat_test)
    correct_test_prediction = tf.equal(tf.argmax(yhat, 2), tf.argmax(y__, 2))
    test_acc_ = tf.reduce_mean(tf.cast(correct_test_prediction, tf.float32))

    lr_current = tf.placeholder(tf.float32)
    train_step = eval(optimizer)(learning_rate=lr_current).minimize(train_loss)
    sess.run(tf.global_variables_initializer())
    tf.train.start_queue_runners(sess, coord)

    _train_losses = []
    accuracy_container = []
    test_acc_container = []
    mean_accuracy = 0
    for i in range(n_iterations):
        start = time.time()
        _train_loss, accuracy, t_accuracy, _ = sess.run(
            [train_loss, acc, test_acc_, train_step],
            feed_dict={lr_current: lr})
        mean_accuracy += accuracy
        _train_losses.append(_train_loss)
        accuracy_container.append(accuracy)
        test_acc_container.append(t_accuracy)

        if scheduler == "none":
            pass
        elif scheduler == "dlp":
            # schedule the step size
            if i % 2000 == 1999:
                if np.mean(_train_losses[-1000:]) >= np.mean(_train_losses[-2000:-1000]):
                    lr = lr * 0.5
        else:
            machine = socket.gethostname()
            if machine != "viacheslav-HP-Pavilion-Notebook":
                raise ValueError("unknown scheduler")

        if i % 100 == 0:
            print("step:", i,
                  "mean_loss", np.mean(_train_loss),
                  "min_loss", np.min(_train_loss))
            print("mean accuracy=", mean_accuracy, "%")
            print("lr", lr)
            print("exps:", cfg.batch_size / (time.time() - start))
            mean_accuracy = 0

        # history
        if i % 1000 == 1:
            _train_loss, = sess.run([train_loss], feed_dict={lr_current: lr})

    sess.close()
    tf.reset_default_graph()

    np.savetxt(os.path.join(cfg.out_path,
                            config_name + "_aam_train_accuracy_" + str(lr) + str(counter)),
               accuracy_container)
    np.savetxt(os.path.join(cfg.out_path,
                            config_name + "_aam_test_accuracy_" + str(lr) + str(counter)),
               test_acc_container)

    sel_point = np.mean(_train_losses[-200:-100], axis=0).argmin()
    minmean_loss = np.mean(_train_losses[:-100], axis=0)[sel_point]
    loss_hist = np.asarray(_train_losses)[:, sel_point]
    return minmean_loss, loss_hist
def save_result(test_predicts, outputfile):
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    sample_submission = input.read_dataset('sample_submission.csv')
    sample_submission[list_classes] = test_predicts
    sample_submission.to_csv(outputfile, index=False, compression='gzip')
def splitTarget(filename):
    list_classes = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    labels = input.read_dataset(filename, list_classes)
    labels.to_csv(PATH + 'labels.csv', index=False)
def Boost(X, Y, test, para, setting, outputfile='boost.csv.gz', cal_weight=False):

    def _train_model(model, model_name, train_x, train_y, val_x, val_y, test,
                     batchsize=BATCHSIZE, frequecy=50, init=30):
        from sklearn.metrics import roc_auc_score
        generator = tool.Generate(train_x, train_y, batchsize=frequecy * batchsize)
        epoch = 1
        best_epoch = 1
        best_score = -1
        while True:
            samples_x, samples_y = generator.genrerate_samples()
            model.fit(samples_x, samples_y,
                      batch_size=batchsize, epochs=1, verbose=0)
            if epoch >= init:
                # evaluate
                y_pred = model.predict(val_x, batch_size=2048, verbose=0)
                Scores = []
                for i in range(6):
                    score = roc_auc_score(val_y[:, i], y_pred[:, i])
                    Scores.append(score)
                cur_score = np.mean(Scores)
                print(cur_score)
                print(Scores)

                if epoch == init or best_score < cur_score:
                    best_score = cur_score
                    best_epoch = epoch
                    print(best_score, best_epoch, '\n')
                    result = y_pred
                    model.save_weights(WEIGHT_FILE + model_name)
                elif epoch - best_epoch > 12:  # early-stopping patience exceeded
                    model.load_weights(WEIGHT_FILE + model_name, by_name=False)
                    test_pred = model.predict(test, batch_size=2048)
                    return test_pred, result
            epoch += 1

    def get_model(model_para):
        m = nnBlock.model(
            embedding_matrix=model_para['embedding_matrix'],
            trainable=model_para['trainable'],
            load_weight=model_para['load_weight'],
            loss=model_para['loss'],
            boost=model_para['boost'],
        )
        m.get_layer(model_para['modelname'])
        return m

    def cv(model_para, X, Y, test, K=5, init=30, sample_weight=None):
        kf = KFold(len(Y), n_folds=K, shuffle=False)
        results = []
        train_score = np.zeros((len(Y), 6))
        for i, (train_index, valid_index) in enumerate(kf):
            print('Fold {} training...'.format(i))
            trainset = tool.splitdata(train_index, X)
            label_train = Y[train_index]
            validset = tool.splitdata(valid_index, X)
            label_valid = Y[valid_index]

            model = get_model(model_para)
            test_pred, val_score = _train_model(
                model, model_para['modelname'] + "_" + str(i) + ".h5",
                trainset, label_train, validset, label_valid, test, init=init)
            train_score[valid_index] = val_score
            results.append(test_pred)
        test_predicts = tool.cal_mean(results, None)
        return train_score, test_predicts

    train_score, test_score = cv(para, X, Y, test, init=30)
    para['boost'] = True
    if cal_weight:
        para['sample_weight'] = np.ones(len(Y))

    for loss, model_name in setting:
        X['boost'] = train_score
        test['boost'] = test_score
        para['loss'] = loss
        para['modelname'] = model_name
        if cal_weight:
            para['sample_weight'] = get_weight(train_score, Y, para['sample_weight'])
        train_score, test_score = cv(para, X, Y, test, init=5)

    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    sample_submission = input.read_dataset('sample_submission.csv')
    sample_submission[list_classes] = test_score
    sample_submission.to_csv(outputfile, index=False, compression='gzip')
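# A hedged sketch of how Boost() might be driven; the names below are illustrative
# assumptions rather than values from the original project. `para` carries the keys
# read by get_model(), and `setting` is a sequence of (loss, model_name) pairs
# consumed by the boosting loop.
boost_para = {
    'embedding_matrix': None,          # assumed to be the prepared embedding matrix
    'trainable': False,
    'load_weight': False,
    'loss': 'binary_crossentropy',     # illustrative placeholder
    'boost': False,
    'modelname': 'rnn',                # illustrative placeholder
}
boost_setting = [
    ('focal_loss', 'rnn_boost'),       # illustrative (loss, model_name) pair
]
# Boost(X, Y, test, boost_para, boost_setting, outputfile='boost.csv.gz', cal_weight=True)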
def createKmeansFeature(usecols, name, k=6):
    from sklearn.cluster import KMeans
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    data = train.append(test)[usecols].values

    # A hand-rolled k-means implementation kept for reference:
    # def distMeas(vecA, vecB):
    #     return np.sqrt(np.sum(np.power(vecA - vecB, 2), axis=1))
    #
    # def KMeans(dataSet, k):
    #     """
    #     k-means clustering.
    #     Creates k centroids, assigns every point to its nearest centroid, then
    #     recomputes the centroids. The process repeats until the cluster
    #     assignments no longer change.
    #     """
    #     def createRandCent(dataSet, k):
    #         """Build a set of k random centroids for the given dataset."""
    #         n = dataSet.shape[1]                       # number of columns
    #         feature_min = dataSet.min(axis=0)          # lower bound of each feature
    #         feature_range = dataSet.max(axis=0) - feature_min
    #         centroids = feature_min + feature_range * np.random.random((k, n))
    #         return centroids
    #
    #     m = dataSet.shape[0]                           # number of rows
    #     clusterAssment = np.zeros(m)                   # cluster assignment (index + error) per sample
    #     centroids = createRandCent(dataSet, k)         # k random initial centroids
    #     distance = np.zeros((m, k))
    #     clusterChanged = True
    #     while clusterChanged:
    #         for j in range(k):
    #             distance[:, j] = distMeas(centroids[j, :], dataSet)
    #
    #         sample_cluster = distance.argmin(axis=1)               # nearest centroid per sample
    #         num_change = np.sum(clusterAssment != sample_cluster)  # how many samples changed cluster
    #         if num_change == 0:
    #             clusterChanged = False
    #         clusterAssment = sample_cluster
    #
    #         for center in range(k):                    # update centroid positions
    #             ptsInClust = dataSet[clusterAssment == center]  # all points in this cluster
    #             centroids[center, :] = np.mean(ptsInClust, axis=0)
    #     # handle nan
    #     centroids = np.nan_to_num(centroids)
    #     return centroids
    #
    # samples = data[usecols].values
    # centroids = KMeans(samples, k)          # k-means clustering
    # for j in range(k):                      # k is the number of centroids
    #     data["kmeans" + str(j + 1)] = \
    #         distMeas(centroids[j, :], samples)  # distance from each sample to each centroid

    model = KMeans(k, max_iter=3000, tol=1e-6, n_jobs=-1)
    features = model.fit_transform(data)
    for i in range(k):
        train[name + '_kmean_' + str(i)] = features[:len(train), i]
        test[name + '_kmean_' + str(i)] = features[len(train):, i]
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)
def get_pos_tag_vec():
    from nltk import pos_tag
    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist() + test['comment_text'].values.tolist()
    text = tokenize_word(text)

    def get_tag_text(text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())
        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(get_tag,
                                      args=(comments[i * aver_t:(i + 1) * aver_t], pos_tag))
            results.append(result)
        pool.close()
        pool.join()

        text_tag = []
        word2tag = {}
        for result in results:
            t_tag, word_2_vec = result.get()
            text_tag.extend(t_tag)
            word2tag.update(word_2_vec)
        return text_tag, word2tag

    def getTfidfVector(clean_corpus, min_df=0, max_features=int(1e10),
                       ngram_range=(1, 1), use_idf=False, sublinear_tf=True):
        def tokenizer(t):
            return t.split()

        tfv = TfidfVectorizer(min_df=min_df, max_features=max_features,
                              tokenizer=tokenizer, strip_accents=None,
                              analyzer="word", ngram_range=ngram_range,
                              use_idf=use_idf, sublinear_tf=sublinear_tf)
        tag_tfidf = tfv.fit_transform(clean_corpus)
        return tag_tfidf, list(tfv.get_feature_names())

    text_tag, word2tag = get_tag_text(text)
    import json
    with open(PATH + 'word2tag.json', 'w') as f:
        f.write(json.dumps(word2tag, indent=4, separators=(',', ': ')))

    tag_tfidf, columns = getTfidfVector(text_tag)
    n_components = POSTAG_DIM
    # Inspect pca.lambdas_; keeping the components that cover 99% of the variance is enough
    pca = KernelPCA(n_components=n_components, kernel='rbf', n_jobs=-1)
    pca_tfidf = pca.fit_transform(tag_tfidf.transpose()).transpose()
    postag_vec = pd.DataFrame(pca_tfidf, columns=columns)
    postag_vec.to_csv(PATH + 'postagVec.csv', index=False)
def LDAFeature(num_topics=NUM_TOPIC):
    from gensim.corpora import Dictionary
    from gensim.models.ldamulticore import LdaMulticore

    def get_corpus(dictionary, text):
        results = []
        pool = mlp.Pool(mlp.cpu_count())
        comments = list(text)
        aver_t = int(len(text) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(doc2bow,
                                      args=(comments[i * aver_t:(i + 1) * aver_t], dictionary))
            results.append(result)
        pool.close()
        pool.join()

        corpus = []
        for result in results:
            corpus.extend(result.get())
        return corpus

    def inference(model, dataset):
        results = []
        pool = mlp.Pool(mlp.cpu_count())
        aver_t = int(len(dataset) / mlp.cpu_count()) + 1
        for i in range(mlp.cpu_count()):
            result = pool.apply_async(lda_infer,
                                      args=(dataset[i * aver_t:(i + 1) * aver_t], model))
            results.append(result)
        pool.close()
        pool.join()

        topics = []
        for result in results:
            topics.extend(result.get())
        return np.array(topics)

    train = input.read_dataset('clean_train.csv')
    test = input.read_dataset('clean_test.csv')
    train['comment_text'] = train['comment_text'].fillna(replace_word['unknow'])
    test['comment_text'] = test['comment_text'].fillna(replace_word['unknow'])
    text = train['comment_text'].values.tolist() + test['comment_text'].values.tolist()
    text = tokenize_word(text)

    # Drop rare words before building the dictionary
    freq = {}
    for sentence in text:
        for word in sentence:
            if word not in freq:
                freq[word] = 0
            freq[word] += 1
    text = [[word for word in sentence if freq[word] > FILTER_FREQ]
            for sentence in tqdm(text)]

    dictionary = Dictionary(text)  # build the (id, word) dictionary
    corpus = get_corpus(dictionary, text)
    print(len(corpus), len(corpus[0]))

    print('begin train lda')
    ldamodel = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary)

    print('inference')
    topic_probability_mat = inference(ldamodel, corpus)
    print(len(topic_probability_mat), len(topic_probability_mat[0]))

    train_sparse = topic_probability_mat[:train.shape[0]]
    test_sparse = topic_probability_mat[train.shape[0]:]

    # Count how many topic components are zero per comment (how many are effective)
    zero_section = {}
    for topics in tqdm(train_sparse):
        num = np.sum(topics == 0)
        num = str(int(num))
        if num not in zero_section:
            zero_section[num] = 0
        zero_section[num] += 1
    for topics in tqdm(test_sparse):
        num = np.sum(topics == 0)
        num = str(int(num))
        if num not in zero_section:
            zero_section[num] = 0
        zero_section[num] += 1
    print(zero_section)

    print('save')
    for i in range(num_topics):
        train['topic' + str(i)] = 0
        test['topic' + str(i)] = 0
    train[['topic' + str(i) for i in range(num_topics)]] = train_sparse
    test[['topic' + str(i) for i in range(num_topics)]] = test_sparse
    train.to_csv(PATH + 'clean_train.csv', index=False)
    test.to_csv(PATH + 'clean_test.csv', index=False)