# Standard-library / third-party dependencies.
import csv
import os
import pickle
import random

import numpy as np
import pandas as pd
import progressbar
import tensorflow as tf
import tensorlayer as tl

# Project-local dependencies (not shown here): config, dataloader, model_seen,
# Controller4Seen, the topic_transfer() helper, and the POS_OF_WORD /
# WORD_TOPIC_TRANSLATION caches used by augment_train().


def collect_full_and_split():
    """Split an in-memory corpus (module-level `data` dict with keys "X", "y"
    and "target_names") into 70/30 train/test CSVs for chen14 clean_elec."""
    class_dict_concept = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ConceptNet",
        class_name_column="ClassCode",
    )
    class_dict_label = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ClassLabel",
        class_name_column="ConceptNet",
    )
    class_dict_count = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ClassLabel",
        class_name_column="Count",
    )

    train_df = pd.DataFrame(columns=["class", "text"])
    test_df = pd.DataFrame(columns=["class", "text"])
    df = pd.DataFrame(columns=["class", "text"])

    index_df = 0
    for idx, class_dir in enumerate(data["target_names"]):
        print(idx, class_dir)

        # Randomly hold out 30% of each class as the test split.
        order = list(range(class_dict_count[class_dir]))
        random.shuffle(order)
        test_order = order[:int(class_dict_count[class_dir] * 0.3)]

        class_samples = [
            x for xid, x in enumerate(data["X"]) if data["y"][xid] == idx
        ]

        for lidx, content in enumerate(class_samples):
            if lidx in test_order:
                test_df.loc[index_df] = [
                    class_dict_concept[class_dict_label[class_dir]], content
                ]
            else:
                train_df.loc[index_df] = [
                    class_dict_concept[class_dict_label[class_dir]], content
                ]
            df.loc[index_df] = [
                class_dict_concept[class_dict_label[class_dir]], content
            ]
            index_df += 1

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    df = df.reset_index(drop=True)

    train_df.to_csv("../data/chen14/clean_elec/train.csv")
    test_df.to_csv("../data/chen14/clean_elec/test.csv")
    df.to_csv("../data/chen14/clean_elec/full.csv")
def augment_train(class_label_path, train_augmented_path, train_path, nott):
    global POS_OF_WORD, WORD_TOPIC_TRANSLATION

    # Reuse cached POS tags and word-level topic translations if available.
    if os.path.isfile(config.POS_OF_WORD_path):
        POS_OF_WORD = pickle.load(open(config.POS_OF_WORD_path, "rb"))
    if os.path.isfile(config.WORD_TOPIC_TRANSLATION_path):
        WORD_TOPIC_TRANSLATION = pickle.load(
            open(config.WORD_TOPIC_TRANSLATION_path, "rb"))

    class_dict = dataloader.load_class_dict(
        class_file=class_label_path,
        class_code_column="ClassCode",
        class_name_column="ClassWord")

    fieldnames = ['No.', 'from_class', 'to_class', 'text']
    csvwritefile = open(train_augmented_path, 'w', encoding="latin-1", newline='')
    writer = csv.DictWriter(csvwritefile, fieldnames=fieldnames)
    writer.writeheader()

    with open(train_path, encoding="latin-1") as csvfile:
        reader = csv.DictReader(csvfile)
        rows = list(reader)
        random.shuffle(rows)
        if nott is not None:  # no. of texts to be translated
            rows = rows[:min(nott, len(rows))]

        count = 0
        with progressbar.ProgressBar(max_value=len(rows)) as bar:
            for idx, row in enumerate(rows):
                text = row['text']
                class_id = int(row['class'])
                class_name = class_dict[class_id]

                # Translate each text into the topic of every other class.
                for cidx in class_dict:
                    if cidx == class_id:
                        continue
                    try:
                        writer.writerow({
                            'No.': count,
                            'from_class': class_id,
                            'to_class': cidx,
                            'text': topic_transfer(
                                text,
                                from_class=class_name,
                                to_class=class_dict[cidx])
                        })
                        count += 1
                    except Exception:
                        continue
                bar.update(idx)

                # Periodically persist the caches.
                if idx % 100 == 0:
                    pickle.dump(POS_OF_WORD, open(config.POS_OF_WORD_path, "wb"))
                    pickle.dump(WORD_TOPIC_TRANSLATION,
                                open(config.WORD_TOPIC_TRANSLATION_path, "wb"))

    csvwritefile.close()
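# Usage sketch for augment_train(). The class-label and train paths reuse
# config names that already appear in this file; the output path is an
# illustrative location (mirroring the clean_elec directory used above), not
# one defined by the project.
def _augment_chen14_elec_example(
        output_path="../data/chen14/clean_elec/train_augmented.csv"):
    augment_train(
        class_label_path=config.chen14_elec_class_label_path,
        train_augmented_path=output_path,
        train_path=config.chen14_elec_train_path,
        nott=None,  # None = translate every training text
    )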
def generate_random_group(classfilename, outfilename, unseen_rate, group_num):
    if os.path.exists(outfilename):
        commandin = input(
            "Random group file already exists, sure to move on? [y/n]")
        if not commandin or commandin[0].lower() != "y":
            return

    class_dict = dataloader.load_class_dict(
        class_file=classfilename,
        class_code_column="ClassCode",
        class_name_column="ConceptNet")
    num_class = len(class_dict.keys())
    num_unseen_class = int(num_class * unseen_rate)
    class_id_list = list(class_dict.keys())

    # Track how often each class is picked as unseen, as a rough balance check.
    unseen_class_num_chosen = dict()
    for class_id in class_dict:
        unseen_class_num_chosen[class_id] = 0

    with open(outfilename, "w") as f:
        for g in range(group_num):
            unseen_class = random.sample(class_id_list, k=num_unseen_class)
            for class_id in unseen_class:
                unseen_class_num_chosen[class_id] += 1

            seen_class = list()
            for class_id in class_id_list:
                if class_id not in unseen_class:
                    seen_class.append(class_id)

            # One line per group: "seen ids|unseen ids", comma-separated.
            rgstr = "%s|%s" % (",".join(str(_) for _ in seen_class),
                               ",".join(str(_) for _ in unseen_class))
            print(rgstr)
            f.write(rgstr + "\n")

    print("\n".join("%d:%d" % (t[0], t[1]) for t in sorted(
        unseen_class_num_chosen.items(), key=lambda x: x[1], reverse=True)))
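# Minimal sketch of how a line written by generate_random_group() can be read
# back into (seen, unseen) class-id lists. It only mirrors the "seen|unseen"
# comma-separated format above; the project's own loader is
# dataloader.get_random_group(), so this is purely illustrative.
def _parse_random_group_line(line):
    """Parse 'id,id,...|id,id,...' into ([seen ids], [unseen ids])."""
    seen_part, unseen_part = line.strip().split("|")
    seen = [int(c) for c in seen_part.split(",") if c]
    unseen = [int(c) for c in unseen_part.split(",") if c]
    return seen, unseen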
def run_amazon():
    random_group = dataloader.get_random_group(
        config.chen14_elec_class_random_group_path)

    vocab = dataloader.build_vocabulary_from_full_corpus(
        config.chen14_elec_full_data_path, config.chen14_elec_vocab_path,
        column="text", force_process=False,
        min_word_count=10
    )

    glove_mat = dataloader.load_glove_word_vector(
        config.word_embed_file_path, config.chen14_elec_word_embed_matrix_path,
        vocab, force_process=False
    )
    assert np.sum(glove_mat[vocab.start_id]) == 0
    assert np.sum(glove_mat[vocab.end_id]) == 0
    assert np.sum(glove_mat[vocab.unk_id]) == 0
    assert np.sum(glove_mat[vocab.pad_id]) == 0

    class_dict = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ClassCode",
        class_name_column="ConceptNet"
    )

    print("Check NaN in csv ...")
    check_nan_train = dataloader.check_df(config.chen14_elec_train_path)
    check_nan_test = dataloader.check_df(config.chen14_elec_test_path)
    print("Train NaN %s, Test NaN %s" % (check_nan_train, check_nan_test))
    assert not check_nan_train
    assert not check_nan_test

    train_class_list = dataloader.load_data_class(
        filename=config.chen14_elec_train_path,
        column="class",
    )
    train_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.chen14_elec_train_path, vocab,
        config.chen14_elec_train_processed_path,
        column="text", force_process=False
    )
    test_class_list = dataloader.load_data_class(
        filename=config.chen14_elec_test_path,
        column="class",
    )
    test_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.chen14_elec_test_path, vocab,
        config.chen14_elec_test_processed_path,
        column="text", force_process=False
    )

    lenlist = [len(text) for text in test_text_seqs] + \
              [len(text) for text in train_text_seqs]
    print("Avg length of documents: ", np.mean(lenlist))
    print("95% length of documents: ", np.percentile(lenlist, 95))
    print("90% length of documents: ", np.percentile(lenlist, 90))
    print("80% length of documents: ", np.percentile(lenlist, 80))

    for i, rgroup in enumerate(random_group):
        max_length = 200
        with tf.Graph().as_default() as graph:
            tl.layers.clear_layers_name()
            mdl = model_seen.Model4Seen(
                model_name="seen_full_chen14_elec_vwonly_random%d_unseen%s_max%d_cnn"
                           % (i + 1, "-".join(str(_) for _ in rgroup[1]), max_length),
                start_learning_rate=0.0001,
                decay_rate=0.5,
                decay_steps=10000,
                max_length=max_length,
                number_of_seen_classes=len(rgroup[0])
            )
            ctl = Controller4Seen(
                model=mdl,
                vocab=vocab,
                class_dict=class_dict,
                word_embed_mat=glove_mat,
                random_unseen_class=False,
                random_unseen_class_list=rgroup[1],
                base_epoch=-1,
            )
            ctl.controller(train_text_seqs, train_class_list, test_text_seqs,
                           test_class_list, train_epoch=100)
            ctl.controller4test(test_text_seqs, test_class_list,
                                unseen_class_list=ctl.unseen_class, base_epoch=100)
            ctl.sess.close()
def run_20news():
    random_group = dataloader.get_random_group(config.news20_class_random_group_path)

    vocab = dataloader.build_vocabulary_from_full_corpus(
        config.news20_full_data_path, config.news20_vocab_path,
        column="text", force_process=False,
        min_word_count=10
    )

    glove_mat = dataloader.load_glove_word_vector(
        config.word_embed_file_path, config.news20_word_embed_matrix_path,
        vocab, force_process=False
    )
    # Special tokens must have all-zero embeddings.
    assert np.sum(glove_mat[vocab.start_id]) == 0
    assert np.sum(glove_mat[vocab.end_id]) == 0
    assert np.sum(glove_mat[vocab.unk_id]) == 0
    assert np.sum(glove_mat[vocab.pad_id]) == 0

    class_dict = dataloader.load_class_dict(
        class_file=config.news20_class_label_path,
        class_code_column="ClassCode",
        class_name_column="ConceptNet"
    )

    # Check that every class label is in the vocab and has a GloVe vector.
    for class_id in class_dict:
        class_label = class_dict[class_id]
        class_label_word_id = vocab.word_to_id(class_label)
        assert class_label_word_id != vocab.unk_id
        assert np.sum(glove_mat[class_label_word_id]) != 0

    print("Check NaN in csv ...")
    check_nan_train = dataloader.check_df(config.news20_train_path)
    check_nan_test = dataloader.check_df(config.news20_test_path)
    print("Train NaN %s, Test NaN %s" % (check_nan_train, check_nan_test))
    assert not check_nan_train
    assert not check_nan_test

    train_class_list = dataloader.load_data_class(
        filename=config.news20_train_path,
        column="class",
    )
    train_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.news20_train_path, vocab, config.news20_train_processed_path,
        column="selected_tfidf", force_process=False
        # column="selected", force_process=False
        # column="text", force_process=True
    )
    test_class_list = dataloader.load_data_class(
        filename=config.news20_test_path,
        column="class",
    )
    test_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.news20_test_path, vocab, config.news20_test_processed_path,
        column="selected_tfidf", force_process=False
        # column="selected", force_process=False
        # column="text", force_process=True
    )

    # import playground
    # playground.tf_idf_document(vocab, glove_mat, train_text_seqs, config.news20_train_path, config.news20_train_path)
    # playground.tf_idf_document(vocab, glove_mat, test_text_seqs, config.news20_test_path, config.news20_test_path)
    # exit()

    # for idx in range(1000, 1010):
    #     print(test_class_list[idx], class_dict[test_class_list[idx]])
    #     print(test_text_seqs[idx])
    #     print([vocab.id_to_word(word_id) for word_id in test_text_seqs[idx]])
    #     print([1 if np.sum(glove_mat[word_id]) else 0 for word_id in test_text_seqs[idx]])

    lenlist = [len(text) for text in test_text_seqs] + \
              [len(text) for text in train_text_seqs]
    print("Avg length of documents: ", np.mean(lenlist))
    print("95% length of documents: ", np.percentile(lenlist, 95))
    print("90% length of documents: ", np.percentile(lenlist, 90))
    print("80% length of documents: ", np.percentile(lenlist, 80))

    for i, rgroup in enumerate(random_group):
        # if i + 1 < config.random_group_start_idx:
        #     continue
        if i + 1 != 6 and i + 1 != 7:
            continue
        max_length = 200
        with tf.Graph().as_default() as graph:
            tl.layers.clear_layers_name()
            mdl = model_seen.Model4Seen(
                # TODO: mistake: the model name should be selected_tfidf
                model_name="seen_selected_tfidf_news20_vwonly_random%d_unseen%s_max%d_cnn"
                           % (i + 1, "-".join(str(_) for _ in rgroup[1]), max_length),
                start_learning_rate=0.0004,
                decay_rate=0.5,
                decay_steps=600,
                max_length=max_length,
                number_of_seen_classes=len(rgroup[0])
            )
            # TODO: if unseen classes are already selected, set random_unseen_class=False
            # and provide a list of unseen classes.
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = config.global_gpu_occupation
            ctl = Controller4Seen(
                model=mdl,
                vocab=vocab,
                class_dict=class_dict,
                word_embed_mat=glove_mat,
                random_unseen_class=False,
                random_unseen_class_list=rgroup[1],
                base_epoch=-1,
                gpu_config=gpu_config
            )
            ctl.controller(train_text_seqs, train_class_list, test_text_seqs,
                           test_class_list, train_epoch=20)
            ctl.controller4test(test_text_seqs, test_class_list,
                                unseen_class_list=ctl.unseen_class, base_epoch=20)
            ctl.sess.close()
def run_dbpedia():
    random_group = dataloader.get_random_group(config.zhang15_dbpedia_class_random_group_path)

    # DBpedia
    vocab = dataloader.build_vocabulary_from_full_corpus(
        # config.zhang15_dbpedia_full_data_path, config.zhang15_dbpedia_vocab_path, column="selected", force_process=False,
        config.zhang15_dbpedia_full_data_path, config.zhang15_dbpedia_vocab_path,
        column="text", force_process=False,
        min_word_count=55
    )

    glove_mat = dataloader.load_glove_word_vector(
        config.word_embed_file_path, config.zhang15_dbpedia_word_embed_matrix_path,
        vocab, force_process=False
    )
    # Special tokens must have all-zero embeddings.
    assert np.sum(glove_mat[vocab.start_id]) == 0
    assert np.sum(glove_mat[vocab.end_id]) == 0
    assert np.sum(glove_mat[vocab.unk_id]) == 0
    assert np.sum(glove_mat[vocab.pad_id]) == 0

    class_dict = dataloader.load_class_dict(
        class_file=config.zhang15_dbpedia_class_label_path,
        class_code_column="ClassCode",
        class_name_column="ConceptNet"
    )

    print("Check NaN in csv ...")
    check_nan_train = dataloader.check_df(config.zhang15_dbpedia_train_path)
    check_nan_test = dataloader.check_df(config.zhang15_dbpedia_test_path)
    print("Train NaN %s, Test NaN %s" % (check_nan_train, check_nan_test))
    assert not check_nan_train
    assert not check_nan_test

    train_class_list = dataloader.load_data_class(
        filename=config.zhang15_dbpedia_train_path,
        column="class",
    )
    train_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.zhang15_dbpedia_train_path, vocab,
        config.zhang15_dbpedia_train_processed_path,
        column="text", force_process=False
        # column="selected", force_process=False
        # column="selected_tfidf", force_process=False
    )
    test_class_list = dataloader.load_data_class(
        filename=config.zhang15_dbpedia_test_path,
        column="class",
    )
    test_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.zhang15_dbpedia_test_path, vocab,
        config.zhang15_dbpedia_test_processed_path,
        column="text", force_process=False
        # column="selected", force_process=False
        # column="selected_tfidf", force_process=False
    )

    # import playground
    # playground.tf_idf_document(vocab, glove_mat, train_text_seqs, config.zhang15_dbpedia_train_path, config.zhang15_dbpedia_train_path)
    # playground.tf_idf_document(vocab, glove_mat, test_text_seqs, config.zhang15_dbpedia_test_path, config.zhang15_dbpedia_test_path)
    # exit()

    lenlist = [len(text) for text in test_text_seqs] + \
              [len(text) for text in train_text_seqs]
    print("Avg length of documents: ", np.mean(lenlist))
    print("95% length of documents: ", np.percentile(lenlist, 95))
    print("90% length of documents: ", np.percentile(lenlist, 90))
    print("80% length of documents: ", np.percentile(lenlist, 80))
    # exit()

    for i, rgroup in enumerate(random_group):
        if i + 1 < config.random_group_start_idx:
            continue
        # unseen_percentage = 0.0
        max_length = 50
        with tf.Graph().as_default() as graph:
            tl.layers.clear_layers_name()
            mdl = model_seen.Model4Seen(
                model_name="seen_full_zhang15_dbpedia_vwonly_random%d_unseen%s_max%d_cnn"
                           % (i + 1, "-".join(str(_) for _ in rgroup[1]), max_length),
                start_learning_rate=0.0004,
                decay_rate=0.5,
                decay_steps=10e3,
                max_length=max_length,
                number_of_seen_classes=len(rgroup[0])
            )
            # TODO: if unseen classes are already selected, set random_unseen_class=False
            # and provide a list of unseen classes.
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = config.global_gpu_occupation
            ctl = Controller4Seen(
                model=mdl,
                vocab=vocab,
                class_dict=class_dict,
                word_embed_mat=glove_mat,
                random_unseen_class=False,
                random_unseen_class_list=rgroup[1],
                base_epoch=-1,
                gpu_config=gpu_config
            )
            ctl.controller(train_text_seqs, train_class_list, test_text_seqs,
                           test_class_list, train_epoch=1)
            # ctl.controller4test(test_text_seqs, test_class_list, unseen_class_list, base_epoch=5)
            ctl.controller4test(test_text_seqs, test_class_list,
                                unseen_class_list=ctl.unseen_class, base_epoch=1)
            ctl.sess.close()
column="text", force_process=False, min_word_count=55) glove_mat = dataloader.load_glove_word_vector( config.word_embed_file_path, config.zhang15_dbpedia_word_embed_matrix_path, vocab, force_process=False) assert np.sum(glove_mat[vocab.start_id]) == 0 assert np.sum(glove_mat[vocab.end_id]) == 0 assert np.sum(glove_mat[vocab.unk_id]) == 0 assert np.sum(glove_mat[vocab.pad_id]) == 0 class_dict = dataloader.load_class_dict( class_file=config.zhang15_dbpedia_class_label_path, class_code_column="ClassCode", class_name_column="ConceptNet") print("Check NaN in csv ...") check_nan_train = dataloader.check_df( config.zhang15_dbpedia_train_path) check_nan_test = dataloader.check_df(config.zhang15_dbpedia_test_path) print("Train NaN %s, Test NaN %s" % (check_nan_train, check_nan_test)) assert not check_nan_train assert not check_nan_test train_class_list = dataloader.load_data_class( filename=config.zhang15_dbpedia_train_path, column="class", )
def tf_idf_category():
    vocab = dataloader.build_vocabulary_from_full_corpus(
        config.zhang15_dbpedia_full_data_path, config.zhang15_dbpedia_vocab_path,
        column="text", force_process=False,
        min_word_count=55)

    class_dict = dataloader.load_class_dict(
        class_file=config.zhang15_dbpedia_class_label_path,
        class_code_column="ClassCode",
        class_name_column="ConceptNet")

    glove_mat = dataloader.load_glove_word_vector(
        config.word_embed_file_path,
        config.zhang15_dbpedia_word_embed_matrix_path, vocab,
        force_process=False)
    assert np.sum(glove_mat[vocab.start_id]) == 0
    assert np.sum(glove_mat[vocab.end_id]) == 0
    assert np.sum(glove_mat[vocab.unk_id]) == 0
    assert np.sum(glove_mat[vocab.pad_id]) == 0

    train_class_list = dataloader.load_data_class(
        filename=config.zhang15_dbpedia_train_path,
        column="class",
    )
    train_text_seqs = dataloader.load_data_from_text_given_vocab(
        config.zhang15_dbpedia_train_path, vocab,
        config.zhang15_dbpedia_train_processed_path,
        column="text", force_process=False)

    print("Combining documents in the same category together ...")
    all_content_for_each_class_dict = dict()
    for idx, document in enumerate(train_text_seqs):
        class_id = train_class_list[idx]
        if class_id not in all_content_for_each_class_dict:
            all_content_for_each_class_dict[class_id] = list()
        all_content_for_each_class_dict[class_id] += document

    import math

    total_number_of_category = len(all_content_for_each_class_dict.keys())
    occur_of_word_in_category_list = np.zeros(vocab.unk_id + 1)
    number_of_word_in_each_category_dict_list = dict()

    print("Counting number of appearance ...")
    for class_id in all_content_for_each_class_dict:
        full_text = all_content_for_each_class_dict[class_id]
        assert class_id not in number_of_word_in_each_category_dict_list
        number_of_word_in_each_category_dict_list[class_id] = np.zeros(vocab.unk_id + 1)
        word_set = set()
        for word_id in full_text:
            number_of_word_in_each_category_dict_list[class_id][word_id] += 1
            if word_id in word_set:
                continue
            word_set.add(word_id)
            occur_of_word_in_category_list[word_id] += 1

    print("IDF")
    # idf(w) = log(N / (1 + number of categories containing w))
    idf_list = np.array([
        math.log(total_number_of_category / (1 + _))
        for _ in occur_of_word_in_category_list
    ])

    print("TF")
    # tf(w, c) = log(1 + raw count of w in category c)
    tf_dict_list = dict()
    for class_id in number_of_word_in_each_category_dict_list:
        assert class_id not in tf_dict_list
        most_freq = np.max(number_of_word_in_each_category_dict_list[class_id])
        # tf_dict_list[class_id] = np.array([0.5 + 0.5 * _ / most_freq for _ in number_of_word_in_each_category_dict_list[class_id]])
        # tf_dict_list[class_id] = np.array([_ / most_freq for _ in number_of_word_in_each_category_dict_list[class_id]])
        tf_dict_list[class_id] = np.array([
            math.log(1 + _)
            for _ in number_of_word_in_each_category_dict_list[class_id]
        ])

    print("TFIDF")
    tfidf_dict_list = dict()
    for class_id in number_of_word_in_each_category_dict_list:
        assert class_id not in tfidf_dict_list
        tfidf_dict_list[class_id] = tf_dict_list[class_id] * idf_list
        # Manually zero out words that have no GloVe vector.
        for word_id in range(vocab.unk_id + 1):
            if np.sum(glove_mat[word_id]) == 0:
                tfidf_dict_list[class_id][word_id] = 0

    print("samples of top indicative words ...")
    for class_id in tfidf_dict_list:
        tfidf_scores = tfidf_dict_list[class_id]
        k = 10
        topk = tfidf_scores.argsort()[-k:][::-1]
        print(class_dict[class_id], [vocab.id_to_word(idx) for idx in topk])
        print(class_dict[class_id], [tfidf_scores[idx] for idx in topk])

    with open(config.zhang15_dbpedia_dir + "TFIDF_class.pkl", "wb") as f:
        pickle.dump(tfidf_dict_list, f)
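# Helper sketch: reload the per-class TF-IDF scores dumped by tf_idf_category()
# and return the k most indicative words for one class. It only relies on
# objects already used above (vocab.id_to_word, class_dict, the pickled dict of
# class_id -> score array); treat it as an illustration, not part of the
# original pipeline.
def top_indicative_words(class_id, vocab, class_dict, k=10, tfidf_path=None):
    tfidf_path = tfidf_path or (config.zhang15_dbpedia_dir + "TFIDF_class.pkl")
    with open(tfidf_path, "rb") as f:
        tfidf_dict_list = pickle.load(f)
    scores = tfidf_dict_list[class_id]
    topk = scores.argsort()[-k:][::-1]  # indices of the k largest scores
    return class_dict[class_id], [vocab.id_to_word(i) for i in topk]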
def collect_full_and_split():
    """Rebuild the chen14 clean_elec splits from the raw per-class .vocab/.docs
    files; expects module-level `home_dir` and `class_dir_list`."""
    class_dict_concept = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ConceptNet",
        class_name_column="ClassCode",
    )
    class_dict_label = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ClassLabel",
        class_name_column="ConceptNet",
    )
    class_dict_count = dataloader.load_class_dict(
        class_file=config.chen14_elec_class_label_path,
        class_code_column="ClassLabel",
        class_name_column="Count",
    )

    train_df = pd.DataFrame(columns=["class", "text"])
    test_df = pd.DataFrame(columns=["class", "text"])
    df = pd.DataFrame(columns=["class", "text"])

    index_df = 0
    for idx, class_dir in enumerate(class_dir_list):
        filename = "%s%s/%s" % (home_dir, class_dir, class_dir)
        print(idx, filename)

        # Map word ids back to words using the per-class .vocab file.
        vocab_dict = dict()
        with open(filename + ".vocab", 'r', encoding="utf8", errors="ignore") as f:
            for line in f:
                word_id = line.split(":")[0]
                word = line.split(":")[1].replace("\n", "")
                assert word_id not in vocab_dict
                vocab_dict[word_id] = word

        # Randomly hold out 30% of each class as the test split.
        order = list(range(class_dict_count[class_dir]))
        random.shuffle(order)
        test_order = order[:int(class_dict_count[class_dir] * 0.3)]

        with open(filename + ".docs", 'r', encoding="utf8", errors="ignore") as f:
            for lidx, line in enumerate(f):
                content = line.replace("\n", "")
                content = content.split(" ")
                content = ' '.join([vocab_dict[c] for c in content])
                if lidx in test_order:
                    test_df.loc[index_df] = [
                        class_dict_concept[class_dict_label[class_dir]], content
                    ]
                else:
                    train_df.loc[index_df] = [
                        class_dict_concept[class_dict_label[class_dir]], content
                    ]
                df.loc[index_df] = [
                    class_dict_concept[class_dict_label[class_dir]], content
                ]
                index_df += 1

    # print(test_df)
    # print(train_df)
    # print(df)
    # exit()

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    df = df.reset_index(drop=True)

    train_df.to_csv("../data/chen14/clean_elec/train.csv")
    test_df.to_csv("../data/chen14/clean_elec/test.csv")
    df.to_csv("../data/chen14/clean_elec/full.csv")
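# Entry-point sketch. This is an assumption about how the routines above are
# driven (the original project may split them across several scripts); the
# unseen_rate/group_num values passed to generate_random_group() are examples.
if __name__ == "__main__":
    # collect_full_and_split()
    # generate_random_group(config.chen14_elec_class_label_path,
    #                       config.chen14_elec_class_random_group_path,
    #                       unseen_rate=0.25, group_num=10)
    # run_amazon()
    # run_20news()
    run_dbpedia()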