def main():
    predict_result_file = from_project_root(
        "lstm_model/result/result_predict-1700.csv")
    distribution_file = from_project_root(
        "lstm_model/result/erro_distribution-1700.csv")
    get_the_error_label_distribution(predict_result_file, distribution_file)
    pass
def main(result_dir):
    all_predict_files = read_all_filenames(from_project_root(result_dir))
    all_predict_results = []
    for predict_file in all_predict_files:
        all_predict_results.append(pk.load(open(predict_file, 'rb')))

    predict_all = []
    predict_all_pro = []
    for i in range(len(all_predict_results[0])):
        predict_one_merge = np.array([0.0] * 19)
        for j in range(len(all_predict_results)):
            predict_one_merge = predict_one_merge + np.array(all_predict_results[j][i])
        # predict_one_merge = predict_one_merge / len(all_predict_results)
        max_index = np.where(predict_one_merge == np.max(predict_one_merge))[0][0]
        predict_all.append(max_index + 1)
        predict_all_pro.append(predict_one_merge)

    predict_context, predict_labels = Data_helper.get_predict_data(from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_dev.csv"))
    macro_f1 = f1_score(predict_all, predict_labels, average='macro')
    accuracy_score1 = accuracy_score(predict_all, predict_labels, normalize=True)
    print("macro_f1:{}".format(macro_f1))
    print("accuracy:{}".format(accuracy_score1))

    # save
    pk.dump(predict_all_pro, open(result_dir + "/predict_merge_dev.pk", 'wb'))
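# The merging loop above is plain soft voting: per-class probabilities from each
# model are summed and the class with the first maximal score wins (labels are
# 1-based in this repo). A tiny self-contained illustration with made-up numbers:
import numpy as np

model_a = np.array([0.1, 0.7, 0.2])   # made-up probabilities from model A
model_b = np.array([0.3, 0.4, 0.3])   # made-up probabilities from model B

merged = model_a + model_b
# np.argmax returns the first index of the maximum, same as np.where(... == max)[0][0]
predicted_class = int(np.argmax(merged)) + 1
print(predicted_class)  # 2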
def main():
    # exit()
    # compute df (document frequency)
    # train_file = from_project_root("lstm_model/processed_data/phrase_level_data.csv")
    # df_pickle = from_project_root("lstm_model/processed_data/one_gram/phrase_level_df.pk")
    # cal_df(train_file, df_pickle)
    # exit()

    # reduce the dimensionality of the bag-of-words tf_bdc weights
    # tfbdc_word_bag_pickle = from_project_root("lstm_model/processed_data/vector/tfbdc_1gram_300000_Xy.pk")
    # pca_tfbdc_pickle = from_project_root("lstm_model/processed_data/vector/pca_tfbdc_1gram_300000_Xy.csv")
    # pca(tfbdc_word_bag_pickle, pca_tfbdc_pickle)
    # exit()

    # build the vocabulary dict from train_file
    # train_file = from_project_root("lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv")
    # vocab_pickle = from_project_root("lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.pk")
    # create_vocab_dict(train_file, vocab_pickle)
    # exit()

    # convert the pickle file into a csv file
    pickle_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.pk")
    save_csv_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.csv")
    transfer_pk_to_csv(pickle_file, save_csv_file)
    exit()
def main():
    # convert one-gram data into n-gram data
    # n_gram = 2
    # phrase_train_file = from_project_root("lstm_model/processed_data/phrase_level_data.csv")
    # n_gram_phrase_train_file = from_project_root("lstm_model/processed_data/two_gram/{}-gram_phrase_level_data.csv".format(n_gram))
    # create_n_gram_sentence(n_gram, phrase_train_file, n_gram_phrase_train_file)
    # exit()

    # filter every sentence
    bdc_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_bdc.json")
    tf_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_tf.json")
    dc_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_dc.json")
    train_file = from_project_root(
        "lstm_model/processed_data/phrase_level_data.csv")
    processed_data_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv")
    pre_processed_sen(bdc_pickle, tf_pickle, dc_pickle, train_file,
                      processed_data_file, limit_word=200)
    pass
def main(): model_url = from_project_root( "data/model/exhaustive_model_epoch16_0.723039.pt") test_url = from_project_root("data/genia.test.iob2") model = torch.load(model_url) evaluate(model, test_url) pass
def main(name):
    # load data from pickle
    pk_url = from_project_root("processed_data/vector/stacked_dc_idf_" + name + "_36.pk")
    print("loading data from", pk_url)
    X, y, X_test = joblib.load(pk_url)
    train_url = from_project_root("data/multilabel_" + name + ".csv")
    test_url = from_project_root("data/test_processed.csv")
    print(X.shape, y.shape, X_test.shape)

    clf = XGBClassifier(n_jobs=-1)  # xgb's default n_jobs=1
    result = get_result_from_stacking(clf, X, y, X_test)

    test_public = pd.read_csv(test_url)['id']
    output_str = 'content_id,subject,sentiment_value,sentiment_word\n'
    for jjj in range(len(result)):
        output_str += "%s,0,%s,\n" % (test_public[jjj], result[jjj])
    outfile = open('result_36' + name + '.csv', 'w')
    outfile.write(output_str)
    outfile.close()

    save_url = from_project_root(
        "processed_data/vector/{}_dc_idf_xgb.pk".format(X.shape[1] // N_CLASSES))
    joblib.dump(
        gen_data_for_stacking(clf, X, y, X_test, n_splits=5,
                              random_state=19950717, name=name), save_url)
    pass
def gen_feature_stacking_result(gen_type='val'):
    """ generate feature stacking result data

    Args:
        gen_type: val or test

    Returns:
        X, y, X_test

    """
    params = load_params()
    print("len(params) =", len(params))
    save_url = from_project_root("data/vector/stacked_%s_XyX_%s_%d_%sc.pk" % (
        ('one' if ONLY_SINGLE else 'all'), gen_type, len(load_params()), LABEL_COL))
    print("stacking data will be saved at", save_url)
    if gen_type == 'val':
        train_url = from_project_root("data/preliminary/train_ex.csv")
        test_url = from_project_root("data/preliminary/test_gold_ex.csv")
        # train_url = from_project_root("data/preliminary/train_exs.csv")
        # test_url = from_project_root("data/preliminary/best_subject_exs.csv")
    elif gen_type == 'test':
        train_url = from_project_root("data/train_2_ex.csv")
        test_url = from_project_root("data/test_public_2v3_ex.csv")
    else:
        print("error, gen_type should be 'val' or 'test'")
        return
    joblib.dump(feature_stacking(train_url, test_url, use_proba=True,
                                 random_state=RANDOM_STATE, drop_words=DROP_WORDS,
                                 only_single=ONLY_SINGLE), save_url)
def main():
    proba_dict = {
        from_project_root('processed_data/result/result1.csv'): 0.1,
        from_project_root('processed_data/result/result2.csv'): 0.9,
    }
    save_url = from_project_root('processed_data/result.csv')
    merge_probas(proba_dict, save_url)
    pass
def main():
    data_urls = [
        from_project_root("data/genia.train.iob2"),
        from_project_root("data/genia.dev.iob2"),
        from_project_root("data/genia.test.iob2")
    ]
    prepare_vocab(data_urls, update=True, min_count=1)
    pass
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: update dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store all f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):
        # for calc dc
        arr = np.array(list(word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # for calc bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
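# For intuition on the weights computed above: dc normalizes the per-class counts
# by the token's total frequency, while bdc first divides by the class size so that
# large classes do not dominate. A minimal sketch of that arithmetic for one
# made-up token (numbers are purely illustrative, not from the repo's data):
import numpy as np

f_t_c = np.array([8.0, 2.0])    # f(t, c_i): raw counts of the token in each class
f_c = np.array([100.0, 10.0])   # f(c_i): total number of tokens in each class

# dc: normalize by the token's total frequency f(t)
p_dc = f_t_c / f_t_c.sum()
dc = 1 + (p_dc * np.log(p_dc)).sum() / np.log(len(f_c))      # ~0.28, looks discriminative

# bdc: balance by class size first, then normalize
p_bdc = f_t_c / f_c
p_bdc = p_bdc / p_bdc.sum()
bdc = 1 + (p_bdc * np.log(p_bdc)).sum() / np.log(len(f_c))   # ~0.14, much weaker after balancing

print(round(dc, 2), round(bdc, 2))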
def main():
    # load_data(data_url, update=False)
    data_urls = [
        from_project_root("data/Germ/germ.train.iob2"),
        from_project_root("data/Germ/germ.dev.iob2"),
        from_project_root("data/Germ/germ.test.iob2")
    ]
    prepare_vocab(data_urls, update=True, min_count=1)
    pass
def main(): model_url = from_project_root("data/model/best_model.pt") print("loading model from", model_url) model = torch.load(model_url) # model = torch.load(model_url, map_location='cpu') test_url = from_project_root("data/genia/genia.test.iob2") evaluate(model, test_url) # predict_on_iob2(model, test_url) pass
def split_data(param_data_df):
    """ Split the dataset into train and validation parts

    :param param_data_df: DataFrame to split
    :return:
    """
    train_df, validation_df = train_test_split(param_data_df, test_size=0.2)
    train_filename = from_project_root("data/small_train.csv")
    test_filename = from_project_root("data/small_test.csv")
    write_data_df(train_filename, train_df)
    write_data_df(test_filename, validation_df)
def rcnn_rcnn_attention():
    rcnn_model = from_project_root(
        "lstm_model/result/prob_rcnnon_cv0.789744.csv")
    rnn_cnn_attention = from_project_root(
        "lstm_model/result/result_rcnn_0.775.pk")
    rcnn_pro = []
    with open(rcnn_model, 'r', encoding='utf-8') as f:
        for line in f.readlines()[1:]:
            pro_list = np.array(line.strip().split(',')[:-1]).astype(np.float32)
            rcnn_pro.append(pro_list)
    rcnn_attention_pro = pk.load(open(rnn_cnn_attention, 'rb'))

    predict_merge = []
    predict_pro_merge = []
    for i in range(len(rcnn_attention_pro)):  # number of samples to predict
        one_predict_merge = (np.array(rcnn_pro[i]) / 5 +
                             np.array(rcnn_attention_pro[i])) / 2
        predict_pro_merge.append(one_predict_merge)
        max_index = np.where(one_predict_merge == np.max(one_predict_merge))[0][0]
        print(max_index)
        predict_merge.append(max_index + 1)  # this is the final merged prediction

    predict_context, ids = Data_helper.get_predict_data(
        from_project_root("lstm_model/processed_data/phrase_level_test_data.csv"))

    # save the merged probabilities
    pk.dump(
        predict_pro_merge,
        open(from_project_root(
            "lstm_model/result/pro_rcnn_rnn_cnn_attention_0.794.pk"), 'wb'))

    # save the prediction results
    with open(from_project_root(
            "lstm_model/result/result_rcnn_rnn_cnn_attention.csv"),
            'w', encoding='utf-8') as f:
        f.write("id,class\n")
        for i in range(len(ids)):
            f.write("{},{}\n".format(ids[i], predict_merge[i]))
    pass
def read_data_df(filename, data_type):
    """ Read the data file in chunks

    :param filename: path to the csv file
    :param data_type: "train" or "test"
    :return: DataFrame with all chunks concatenated
    """
    filename = from_project_root(filename)
    if data_type == "train":
        data_df = pd.read_csv(filename, chunksize=10000,
                              dtype={"id": str, "article": str,
                                     "word_seg": str, "class": int},
                              engine="c")
    elif data_type == "test":
        data_df = pd.read_csv(filename, chunksize=10000,
                              dtype={"id": str, "article": str,
                                     "word_seg": str},
                              engine="c")
    tr_list = []
    for tr in data_df:
        tr_list.append(tr)
    data_df = pd.concat(tr_list)
    return data_df
def main():
    json_url = from_project_root("processed_data/entity2contents.json")
    json_data = json_util.load(json_url)
    print(json_data["红楼梦"])
    exit()
def load_raw_data(data_url, ngram=1):
    """ load data to get labels list and sentences list, set ngram=None if you want
        every sentence to be a space separated string instead of ngram list

    Args:
        data_url: url to data file
        ngram: generate ngram in sentence

    Returns:
        (list, list): labels and sentences

    """
    if not exists(data_url):
        generate_level_data(from_project_root("data/train_set.csv"))
    with open(data_url, "r", encoding="utf-8") as data_file:
        labels = list()
        sentences = list()
        print("loading data from \n ", data_url)
        s_time = time()
        for line in data_file:
            line = line.split(',')
            labels.append(int(line[0]))
            if ngram is not None:
                sentences.append(sentence_to_ngram(line[1], ngram))
            else:
                sentences.append(line[1])
        e_time = time()
        print("finished loading in %.3f seconds\n" % (e_time - s_time))
    return labels, sentences
def gen_rematch_val():
    """ Use train data of rematch to generate gold result of test data in preliminary """
    train_df = pd.read_csv(from_project_root("data/train_2.csv"))
    test_df = pd.read_csv(from_project_root("data/preliminary/test_public.csv"))
    val_df = test_df.merge(train_df, on='content') \
        .drop(columns=['content_id_y']) \
        .rename(columns={'content_id_x': 'content_id'})
    val_df.to_csv(from_project_root('data/preliminary/test_gold.csv'), index=False)

    test_df = pd.read_csv(from_project_root("data/test_public_2.csv"))
    test_df = test_df[~test_df['content_id'].isin(val_df['content_id'])]
    test_df.to_csv('data/test_2.csv', index=False)
def transform(train_url=TRAIN_URL, test_url=None, column='word_seg', tw_type=TW_TYPE):
    """
    Args:
        column: column to use
        train_url: str, url to train data (with header)
        test_url: url to test data
        tw_type: str, term weighting type {idf, dc, bdc}

    Returns:
        X, y, X_test: vectorized data

    """
    data_url = from_project_root("processed_data/phrase_level_data.csv")
    if column == 'article':
        data_url = data_url.replace('phrase', 'word')
    if tw_type == 'idf':
        return tfidf_transform(train_url, test_url, column=column)
    elif tw_type == 'dc':
        dc_dict = cw.calc_dc(data_url, ngram=MAX_N)
        return dict_transform(dc_dict, train_url, test_url, column=column)
    elif tw_type == 'bdc':
        bdc_dict = cw.calc_bdc(data_url, ngram=MAX_N)
        return dict_transform(bdc_dict, train_url, test_url, column=column)
def main(): print("data generating...") xy_url = from_project_root( "processed_data/vector/{}_tf{}_{}gram_{}_XyN.pk".format( COLUMN, TW_TYPE, MAX_N, MAX_FEATURES)) # test_url = None test_url = from_project_root('data/test_set.csv') if test_url: xy_url.replace('XyN', 'XyX_test') print("generated (X, y, X_test) will be saved at", xy_url) X, y, X_test = transform(TRAIN_URL, test_url, column=COLUMN, tw_type=TW_TYPE) joblib.dump((X, y, X_test), xy_url) pass
def ft_process(data_url=None):
    """ process data into what the ft model needs, and save it into './processed_data' dir

    Args:
        data_url: url to original .csv data

    Returns:
        str: url to saved processed data

    """
    save_filename = basename(data_url).replace('.csv', '_ft.csv')
    save_url = from_project_root("embedding_model/processed_data/" + save_filename)
    # file specified by data_url is already processed
    if exists(save_url):
        return save_url

    if data_url is not None:
        labels, sentences = load_raw_data(data_url)
    else:
        train_df = load_to_df(TRAIN_URL)
        labels = train_df['class'].values
        sentences = train_df['word_seg']

    with open(save_url, "w", encoding='utf-8', newline='\n') as ft_file:
        for i in range(len(labels)):
            label = FT_LABEL_PREFIX + str(labels[i])
            sentence = ' '.join(sentences[i])
            ft_file.write('{} {}\n'.format(label, sentence))
    return save_url
def main():
    params = load_params()
    print("len(params) =", len(params))
    save_url = from_project_root(
        "processed_data/vector/stacked_dc_idf_lsvc_%d.pk" % len(load_params()))
    joblib.dump(feature_stacking(use_proba=True, random_state=RANDOM_STATE), save_url)
def evaluate(pred_url, use_senti=True):
    """ evaluate result file of preliminary test data

    Args:
        pred_url: str, url of predicted result file
        use_senti: bool, use sentiment_value column or not

    """
    usecols = ['content_id', 'subject']
    if use_senti:
        usecols.append('sentiment_value')
    true_df = pd.read_csv(from_project_root('data/preliminary/test_gold.csv'),
                          usecols=usecols)
    pred_df = pd.read_csv(pred_url, usecols=usecols)

    # tp: number of correct predictions
    # fp: number of wrong or extra predictions
    # fn: number of missed predictions
    tp = len(true_df.merge(pred_df, on=usecols))
    fp = len(pred_df) - tp
    fn = len(true_df) - tp
    print("metrics on test set of preliminary%s:" % ("" if use_senti else " without sentiment"))
    print(" tp = %d, fp = %d, fn = %d, n_samples = %d" % (tp, fp, fn, tp + fn))
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    micro_f1 = 2 * recall * precision / (recall + precision)
    print(" recall = %f, precision = %f, micro_f1 = %f\n" % (recall, precision, micro_f1))
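# A quick, self-contained sanity check of the metric arithmetic above,
# using made-up counts (not from any real run):
tp, fp, fn = 8, 4, 2
recall = tp / (tp + fn)        # 0.8
precision = tp / (tp + fp)     # ~0.667
micro_f1 = 2 * recall * precision / (recall + precision)
print(round(micro_f1, 3))      # ~0.727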
def predict(pro_file):
    data = pk.load(open(pro_file, 'rb'))
    result = []
    for predict_one_merge in data:
        max_index = np.where(predict_one_merge == np.max(predict_one_merge))[0][0]
        result.append(max_index + 1)
    predict_context, ids = Data_helper.get_predict_data(
        from_project_root("lstm_model/processed_data/phrase_level_test_data.csv"))

    # save the results
    with open(from_project_root(
            "hierarchicalAttention_Model/result/result_rcnn_rcnn_atten_han_5cv.csv"),
            'w', encoding='utf-8') as f:
        f.write("id,class\n")
        for i in range(len(ids)):
            f.write("{},{}\n".format(ids[i], result[i]))
def one_hot(param_data, sentence_type):
    """ Compute the one_hot value for each word

    :param param_data: DataFrame containing the raw text columns
    :param sentence_type: "phrase" or "word"
    :return:
    """
    word_dictionary = []
    data = None
    if sentence_type == "phrase":
        data = param_data["word_seg"].values
    elif sentence_type == "word":
        data = param_data["article"].values
    for sentence in tqdm(data):
        word_list = sentence.split(" ")
        word_list_only = list(set(word_list))
        word_dictionary.extend(word_list_only)
    word_dictionary_count = Counter(word_dictionary)
    word_dictionary_only = list(word_dictionary_count.items())
    word_value = [1] * len(word_dictionary_only)
    word_df = pd.DataFrame(word_dictionary_only, columns=["word", "count"])
    word_df["weight"] = word_value
    filename = "processed_data/csv_weight/" + sentence_type + "_level_one_hot.csv"
    filename = from_project_root(filename)
    word_df.to_csv(filename, index=False)
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: update dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1
    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
def validate(pkl_url=None, cv=5, evaluating=False):
    """ do validating

    Args:
        pkl_url: load data from pickle file, set to None to generate data instantly
        cv: number of folds for cross validation
        evaluating: whether to do evaluating on test_gold

    """
    clfs = init_clfs()
    val_url = from_project_root("data/preliminary/test_gold_ex.csv")
    if pkl_url is not None:
        # load from pickle
        print("loading data from", pkl_url)
        X, y, X_val = joblib.load(pkl_url)
    else:
        train_url = from_project_root("data/preliminary/train_ex.csv")
        # generate from original csv
        X, y, X_val = generate_vectors(train_url, val_url, column='article',
                                       max_n=3, min_df=3, max_df=0.8,
                                       max_features=20000, trans_type='dc',
                                       sublinear_tf=True, balanced=True,
                                       multilabel_out=False, label_col='subjects',
                                       only_single=True, shuffle=True)
    print("data shapes:\n", X.shape, y.shape, X_val.shape)
    for name, clf in clfs.items():
        if len(y.shape) > 1:
            clf = OneVsRestClassifier(clf)
        print("cross validation on %s is running" % name)
        validate_clf(clf, X, y, cv=cv, scoring='f1_micro')
        if evaluating:
            print("metrics of %s classifier:" % name)
            clf.fit(X, y)
            y_true = pd.read_csv(val_url, usecols=list(map(str, range(10)))).values < 2
            y_pred = clf.predict(X_val)
            y_probas = predict_proba(clf, X_val)
            calc_metrics(y_true, y_pred, y_probas)
def feature_stacking(n_splits=CV, random_state=None, use_proba=False,
                     verbose=False, drop_words=DROP_WORDS):
    """
    Args:
        n_splits: n_splits for KFold
        random_state: random_state for KFold
        use_proba: True to predict probabilities of labels instead of labels
        verbose: True to print more info
        drop_words: drop_words for run_parallel

    Returns:
        X, y, X_test

    """
    # clf = OneVsRestClassifier(SVC(kernel='linear', probability=True))  # multilabel
    clf = OneVsRestClassifier(LinearSVCP())  # LinearSVC for multilabel
    # train_url = from_project_root("data/multilabel.csv")
    # test_url = from_project_root("data/test_processed.csv")
    train_url = from_project_root("../data/multilabel.csv")
    test_url = from_project_root("../data/test_processed.csv")
    # test_url = None
    X, y, X_test = generate_vectors(train_url, test_url, sublinear_tf=False)  # for X.shape

    params_list = load_params()
    parallel = joblib.Parallel(n_jobs=N_JOBS, verbose=True)
    rets = parallel(
        joblib.delayed(run_parallel)(ind, train_url, test_url, params, clf, n_splits,
                                     random_state, use_proba, verbose, drop_words)
        for ind, params in enumerate(params_list))
    rets = sorted(rets, key=lambda x: x[0])

    X_stack_train = np.empty((X.shape[0], 0), float)
    X_stack_test = np.empty((X_test.shape[0], 0), float)
    for ind, y_pred, y_pred_test in rets:
        X_stack_train = np.append(X_stack_train, y_pred, axis=1)
        X_stack_test = np.append(X_stack_test, y_pred_test, axis=1)
    return X_stack_train, y, X_stack_test
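# run_parallel is defined elsewhere in the repo; the sketch below only illustrates
# the usual out-of-fold scheme such a helper implements (an assumption, not the
# repo's actual code; out_of_fold_predict is a made-up name for illustration):
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold


def out_of_fold_predict(clf, X, y, X_test, n_splits=5, random_state=None):
    """Minimal out-of-fold stacking sketch: every training row is predicted by a
    model that never saw it, and test predictions are averaged over the folds."""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    y_pred = np.zeros((X.shape[0], 1))
    y_pred_test = np.zeros((X_test.shape[0], n_splits))
    for i, (train_idx, val_idx) in enumerate(kf.split(X)):
        fold_clf = clone(clf).fit(X[train_idx], y[train_idx])
        y_pred[val_idx, 0] = fold_clf.predict(X[val_idx])
        y_pred_test[:, i] = fold_clf.predict(X_test)
    return y_pred, y_pred_test.mean(axis=1, keepdims=True)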
def main():
    train_data_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv")
    n_parts = 5
    dev_nums = 5000
    split_data_to_parts(train_data_file, n_parts, dev_nums)
    pass
def main():
    kwargs = {
        'size': 300,
        'min_count': 5,
        'window': 5,
        'iter': 5,
        'sg': 1,
        'hs': 1
    }
    model = train_w2v_model(data_url=None, kwargs=kwargs)
    print(len(model.wv.vocab))
    wv_url = from_project_root(
        "embedding_model/models/wv_word_seg_300_5_5_5_1_1.txt")
    save_url = from_project_root("processed_data/vector/avg_wvs_300.pk")
    gen_data_for_clf(wv_url, save_url=save_url)
    pass