def add_pinlei_tag_yyh(self):
    LogInfo.begin_track("Begin adding tags for pinleis...")
    fin = codecs.open(self.root_fp + "/yyh_w2v_train.txt", 'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/yyh_w2v_train.txt.pinlei_tag", 'w', encoding='utf-8')
    cnt = 0
    for line in fin:
        spt = line.strip().split()
        new_line = ""
        i = 0
        while i < len(spt):
            # Greedy longest-match: try 4-term, then 3-, 2-, 1-term concatenations.
            if i + 3 < len(spt):
                str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]  # was spt[3]: index bug fixed
                if str4 in self.pinlei_set:
                    LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                    new_line += "[[" + str4 + "]] "
                    i += 4
                    continue
            if i + 2 < len(spt):
                str3 = spt[i] + spt[i + 1] + spt[i + 2]
                if str3 in self.pinlei_set:
                    # LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                    #              spt[i], spt[i+1], spt[i+2])
                    new_line += "[[" + str3 + "]] "
                    i += 3
                    continue
            if i + 1 < len(spt):
                str2 = spt[i] + spt[i + 1]
                if str2 in self.pinlei_set:
                    # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                    #              spt[i], spt[i+1])
                    new_line += "[[" + str2 + "]] "
                    i += 2
                    continue
            if spt[i] in self.pinlei_set:
                # LogInfo.logs("Found pinlei [%s]", spt[i])
                new_line += "[[" + spt[i] + "]] "
                i += 1
                continue
            new_line += spt[i] + " "
            i += 1
        fout.write(new_line + "\n")
        cnt += 1
        if cnt < 5:
            LogInfo.logs("res ==> (%s)", new_line)
        LogInfo.show_line(cnt, 100000)
    fin.close()
    fout.close()
    LogInfo.end_track("Pinlei tags added.")
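# A minimal, self-contained sketch of the longest-match-first tagging that
# add_pinlei_tag_yyh performs above. tag_pinleis, the toy pinlei set and the
# toy tokens are hypothetical; the real code reads self.pinlei_set and files
# under self.root_fp.
def tag_pinleis(tokens, pinlei_set, max_len=4):
    out = []
    i = 0
    while i < len(tokens):
        for n in range(min(max_len, len(tokens) - i), 0, -1):
            cand = "".join(tokens[i:i + n])  # concatenate n adjacent terms
            if cand in pinlei_set:
                out.append("[[" + cand + "]]")
                i += n
                break
        else:  # no concatenation matched: keep the plain token
            out.append(tokens[i])
            i += 1
    return " ".join(out)

# Example: tag_pinleis(["夏季", "连衣", "裙"], {"连衣裙"}) returns "夏季 [[连衣裙]]".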
def load(self, data_file, encoding):
    LogInfo.begin_track("Loading data from %s...", data_file)
    if os.path.isfile(data_file):
        LogInfo.begin_track("[Exist] Loading from %s...", data_file)
        query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
            = list(), list(), list(), list(), list(), list()
        cnt = 0
        with codecs.open(data_file, 'r', encoding=encoding) as fin:
            for line in fin:
                spt = line.strip().split("\t")
                query_idxs.append([int(idx) for idx in spt[0].split(" ")])
                query_lens.append(int(spt[1]))
                labels.append([int(idx) for idx in spt[2].split(" ")])
                intents.append(int(spt[3]))
                link_masks.append([int(idx) for idx in spt[4].split(" ")])
                entity_idxs.append([int(idx) for idx in spt[5].split(" ")])
                cnt += 1
                LogInfo.show_line(cnt, 1000000)
        LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
    else:
        txt_data_file = data_file + ".name"
        LogInfo.begin_track("[Not Exist] Loading from %s...", txt_data_file)
        query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
            = list(), list(), list(), list(), list(), list()
        cnt = 0
        fout = codecs.open(data_file, 'w', encoding=encoding)
        with codecs.open(txt_data_file, 'r', encoding=encoding) as fin:
            for line in fin:
                query_idx, query_len, label, intent, link_mask, entity_idx \
                    = self.decode_line(line)
                fout.write(" ".join([str(x) for x in query_idx]) + "\t" +
                           str(query_len) + "\t" +
                           " ".join([str(x) for x in label]) + "\t" +
                           str(intent) + "\t" +
                           " ".join([str(x) for x in link_mask]) + "\t" +
                           " ".join([str(x) for x in entity_idx]) + "\n")
                query_idxs.append(query_idx)
                query_lens.append(query_len)
                labels.append(label)
                intents.append(intent)
                link_masks.append(link_mask)
                entity_idxs.append(entity_idx)
                cnt += 1
                LogInfo.show_line(cnt, 1000000)
        fout.close()
        LogInfo.logs("Write into %s.", data_file)
        LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)  # was self.max: attribute name fixed
    self.data = list(zip(query_idxs, query_lens, labels, intents, link_masks, entity_idxs))
    self.data_size = len(self.data)
    LogInfo.end_track("Loaded. Size: %d.", self.data_size)
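# A hedged sketch of the record format produced by the "[Not Exist]" branch
# above: six tab-separated fields, with space-separated integers inside the
# list-valued fields. encode_record and parse_record are hypothetical helpers
# for illustration only; the loader itself inlines this logic.
def encode_record(query_idx, query_len, label, intent, link_mask, entity_idx):
    return "\t".join([
        " ".join(str(x) for x in query_idx),
        str(query_len),
        " ".join(str(x) for x in label),
        str(intent),
        " ".join(str(x) for x in link_mask),
        " ".join(str(x) for x in entity_idx),
    ]) + "\n"

def parse_record(line):
    spt = line.rstrip("\n").split("\t")
    return ([int(x) for x in spt[0].split(" ")], int(spt[1]),
            [int(x) for x in spt[2].split(" ")], int(spt[3]),
            [int(x) for x in spt[4].split(" ")],
            [int(x) for x in spt[5].split(" ")])

# Example: parse_record(encode_record([12, 7, 305], 3, [0, 0, 1], 2,
#                                      [0, 1, 1], [88, 88, 88]))
# round-trips to the original values.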
def load_vocab_embedding(self, embedding_file, encoding):
    LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
    vocab_embedding = len(self.vocab_index_dict) * [None]
    with codecs.open(embedding_file, 'r', encoding=encoding) as fin:
        count = 0
        for line in fin:
            strs = line.split()
            embedding = [float(strs[i].strip()) for i in range(1, len(strs))]
            vocab_embedding[self.vocab_index_dict[strs[0].strip()]] = embedding
            count += 1
            LogInfo.show_line(count, 50000)
        assert count == len(vocab_embedding)
    self.vocab_embedding = np.asarray(vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
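# A hedged sketch of the expected embedding-file line format consumed by
# load_vocab_embedding above: "<word> <v1> <v2> ... <vd>", one line per vocab
# entry, with load_vocab_name (below) expected to have filled vocab_index_dict
# first, since the embedding buffer is sized from it. The sample line is
# invented.
sample_line = "shoes 0.12 -0.03 0.40"
strs = sample_line.split()
word, embedding = strs[0], [float(v) for v in strs[1:]]
# word == "shoes", embedding == [0.12, -0.03, 0.4]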
def load_vocab_name(self, vocab_file, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        for line in fin:
            self.vocab_index_dict[line.strip()] = index
            self.index_vocab_dict.append(line.strip())
            index += 1
            LogInfo.show_line(index, 50000)
        self.vocab_size = index
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
def load(self, data_file, encoding):
    LogInfo.begin_track("Loading data from %s...", data_file)
    context_idxs, context_seqs, pinlei_idxs = list(), list(), list()
    cnt = 0
    with codecs.open(data_file, 'r', encoding=encoding) as fin:
        for line in fin:
            context_idx, context_seq, pinlei_idx = self.decode_line(line)
            context_idxs.append(context_idx)
            context_seqs.append(context_seq)
            pinlei_idxs.append(pinlei_idx)
            cnt += 1
            LogInfo.show_line(cnt, 10000)
    self.data = list(zip(context_idxs, context_seqs, pinlei_idxs))
    self.data_size = len(self.data)
    LogInfo.end_track()
def load_vocab(self, vocab_file, embedding_dim, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    self.vocab_embedding.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        # 0 embedding for not-found query term
        self.vocab_index_dict["[[NULL]]"] = index
        self.index_vocab_dict.append("[[NULL]]")
        self.vocab_embedding.append([0.0 for _ in range(embedding_dim)])
        index += 1
        for line in fin:
            spt = line.strip().split()
            self.vocab_index_dict[spt[0]] = index
            self.index_vocab_dict.append(spt[0])
            embedding = [float(spt[i].strip()) for i in range(1, len(spt))]
            self.vocab_embedding.append(embedding)
            index += 1
            LogInfo.show_line(index, 50000)
    self.vocab_size = len(self.vocab_embedding)
    self.vocab_embedding = np.array(self.vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
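# A hedged sketch of how the "[[NULL]]" slot at index 0 is typically used:
# query terms missing from vocab_index_dict fall back to index 0, whose
# embedding is the all-zero vector appended above. term_to_index and the
# sample dict are hypothetical, not part of the original class.
def term_to_index(vocab_index_dict, term):
    return vocab_index_dict.get(term, 0)  # 0 == "[[NULL]]" == zero embedding

# Example:
#   term_to_index({"[[NULL]]": 0, "shoes": 1}, "shoes")   -> 1
#   term_to_index({"[[NULL]]": 0, "shoes": 1}, "sandals") -> 0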
def load_data(self):
    """
    load data from files
    :return:
    """
    LogInfo.begin_track("Loading data...")
    # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
    #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
    with open("/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            try:
                # Materialize the vectors as lists (a bare map object would be
                # exhausted after one use under Python 3).
                vecc = [float(x) for x in sptc[1:]]
                vece = [float(x) for x in spte[1:]]
                self.sync[wordc] = vecc
                self.syne_neg[worde] = vece
            except ValueError:
                LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                continue
    LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                 len(self.sync), len(self.syne_neg))

    with open("/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt") as fine:
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            try:
                vecc = [float(x) for x in sptc[1:]]
                vece = [float(x) for x in spte[1:]]
                self.sync_neg[wordc] = vecc
                self.syne[worde] = vece
            except ValueError:
                LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                continue
    LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                 len(self.sync_neg), len(self.syne))

    # NN, JJ, VB
    with open("/home/yuchen/data/copa_phr.txt") as fin:
        for i in range(1000):
            raw_sentence = fin.readline()
            raw_option1 = fin.readline()
            raw_option2 = fin.readline()
            # Each token has the form "POS:word"; keep the word part.
            sentence = [x.split(':')[1] for x in raw_sentence.strip().split()]
            option1 = [x.split(':')[1] for x in raw_option1.strip().split()]
            option2 = [x.split(':')[1] for x in raw_option2.strip().split()]
            self.copa_data.append([sentence, option1, option2])
    LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

    with open("/home/yuchen/data/copa_label.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            self.copa_ground.append([spt[1], int(spt[2])])
    LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground))
    LogInfo.end_track()
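# A hedged sketch of the assumed layout of copa_phr.txt: three lines per COPA
# item (premise, alternative 1, alternative 2), each token written as
# "POS:word", with the code above keeping only the word part. parse_copa_line
# and the sample line are hypothetical.
def parse_copa_line(raw_line):
    return [tok.split(':')[1] for tok in raw_line.strip().split()]

# parse_copa_line("NN:man VB:fall JJ:icy NN:sidewalk")
# -> ['man', 'fall', 'icy', 'sidewalk']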
def load_data(self):
    """
    load data from files
    :return:
    """
    LogInfo.begin_track("Loading data...")
    # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
    #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
    with open("/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            # Materialize the vectors as lists (a bare map object would be
            # exhausted after one use under Python 3).
            vecc = [float(x) for x in sptc[1:]]
            vece = [float(x) for x in spte[1:]]
            self.sync[wordc] = vecc
            self.syne_neg[worde] = vece
    LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                 len(self.sync), len(self.syne_neg))

    with open("/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt") as fine:
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            vecc = [float(x) for x in sptc[1:]]
            vece = [float(x) for x in spte[1:]]
            self.sync_neg[wordc] = vecc
            self.syne[worde] = vece
    LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                 len(self.sync_neg), len(self.syne))

    # Keep only content words: NN, JJ, VB
    with open("/home/yuchen/data/copa_lem.txt") as fin:
        for i in range(1000):
            raw_sentence = fin.readline()
            raw_option1 = fin.readline()
            raw_option2 = fin.readline()
            sentence = list()
            option1 = list()
            option2 = list()
            for word in raw_sentence.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    sentence.append(word.split(':')[1])
            for word in raw_option1.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    option1.append(word.split(':')[1])
            for word in raw_option2.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    option2.append(word.split(':')[1])
            self.copa_data.append([sentence, option1, option2])
    LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

    with open("/home/yuchen/data/copa_label.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            self.copa_ground.append([spt[1], int(spt[2])])
    LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground))
    LogInfo.end_track()
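# The lemmatized variant copa_lem.txt is filtered to content words. A hedged,
# hypothetical helper equivalent to the three identical loops above; note that
# str.startswith accepts a tuple of prefixes.
def keep_content_words(raw_line, prefixes=('NN', 'JJ', 'VB')):
    return [tok.split(':')[1]
            for tok in raw_line.strip().split()
            if tok.startswith(prefixes)]

# keep_content_words("DT:the NN:man VB:fall IN:on JJ:icy NN:sidewalk")
# -> ['man', 'fall', 'icy', 'sidewalk']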
def process_query(self):
    LogInfo.begin_track("Begin adding tags for queries...")
    fin = codecs.open(self.root_fp + "/query.txt", 'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/query_label.txt", 'w', encoding='utf-8')
    cnt = 0
    for line in fin:
        spt = line.strip().split()
        new_line = ""
        context = ""
        label = set()
        i = 0
        while i < len(spt):
            if i + 4 < len(spt):
                str5 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3] + spt[i + 4]
                if str5 in self.pinlei_set:
                    LogInfo.logs("Found 5-term pinlei [%s|%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3], spt[i + 4])
                    label.add(str5)
                    new_line += "[[" + str5 + "]] "
                    i += 5
                    continue
            if i + 3 < len(spt):
                str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                if str4 in self.pinlei_set:
                    LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                    label.add(str4)
                    new_line += "[[" + str4 + "]] "
                    i += 4
                    continue
            if i + 2 < len(spt):
                str3 = spt[i] + spt[i + 1] + spt[i + 2]
                if str3 in self.pinlei_set:
                    LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2])
                    label.add(str3)
                    new_line += "[[" + str3 + "]] "
                    i += 3
                    continue
            if i + 1 < len(spt):
                str2 = spt[i] + spt[i + 1]
                if str2 in self.pinlei_set:
                    # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                    #              spt[i], spt[i+1])
                    label.add(str2)
                    new_line += "[[" + str2 + "]] "
                    i += 2
                    continue
            if spt[i] in self.pinlei_set:
                # LogInfo.logs("Found pinlei [%s]", spt[i])
                label.add(spt[i])
                new_line += "[[" + spt[i] + "]] "
                i += 1
                continue
            context += spt[i] + " "
            new_line += spt[i] + " "
            i += 1
        if len(label) != 0:
            ret = new_line.strip() + "\t" + \
                  context.strip() + "\t" + \
                  "\t".join(label) + "\n"
        else:
            ret = new_line.strip() + "\n"
        fout.write(ret)
        cnt += 1
        if cnt < 5:
            LogInfo.logs("res ==> (%s)", ret.strip())
        LogInfo.show_line(cnt, 100000)
    fin.close()
    fout.close()
    LogInfo.end_track("Query processed.")
fb_path = "/home/kangqi/Freebase/Transform"

LogInfo.begin_track("Loading wiki-fb entity map...")
wiki_fb_map = dict()
cnt = 0
with open(fb_path + "/GS-cleanWiki-triple.txt") as fin:
    for line in fin:
        spt = line.strip().split('\t')
        if len(spt) < 3:
            continue
        fb_ent = spt[0]
        wiki_ent = spt[2].split('/wiki/')[1][:-1]
        wiki_ent = wiki_ent.lower().replace('_', ' ')
        wiki_fb_map[wiki_ent] = fb_ent
        cnt += 1
        LogInfo.show_line(cnt, 500000)
LogInfo.end_track("%d pairs in total", cnt)

LogInfo.begin_track("Loading fb entity pop...")
fb_ent_pop_map = dict()
cnt = 0
with open("/home/xusheng/freebase/top5m.mid") as fin:
    for line in fin:
        spt = line.strip().split('\t')
        if len(spt) < 2:
            continue
        ent = spt[0]
        pop = int(spt[1])
        fb_ent_pop_map[ent] = pop
        cnt += 1
        LogInfo.show_line(cnt, 500000)
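# A hedged sketch of the wiki-triple parsing above. The sample line is
# invented (the mid and predicate are placeholders); the code assumes the
# third field looks like <http://en.wikipedia.org/wiki/Some_Title>, so the
# trailing ">" is dropped with [:-1].
sample = "m.0123456\t<wiki_url>\t<http://en.wikipedia.org/wiki/Barack_Obama>"
spt = sample.split('\t')
wiki_ent = spt[2].split('/wiki/')[1][:-1].lower().replace('_', ' ')
# wiki_ent == "barack obama"; it is then mapped to the Freebase mid in spt[0]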