# -*- coding: utf-8 -*-
import codecs
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from gensim import models
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Project-local helpers such as read_cut_file, loadDict, saveDict, my_logger,
# ensure_dir_exist and min_ele_array are assumed to be importable from the
# repo's utility modules.


def analyze_wv_vocab_coverage(wv_path,
                              global_train_path="../data/atec/training.csv",
                              min_count=2):
    """Analyze vocabulary coverage between the word vectors and the training data."""
    # The path is expected to look like ".../<file_type>/<name>_<level>-...",
    # e.g. "word2vec/atec_word-300", so level and toolkit are parsed from it.
    wv_name = wv_path.split("/")[-1]
    level_type = wv_name.split("_")[-1].split("-")[0]
    assert level_type in ["char", "word", "wc"]
    file_type = wv_path.split("/")[-2]
    assert file_type in ["glove", "word2vec", "fasttext"]
    if file_type == "word2vec":
        model = models.Word2Vec.load(wv_path)
    else:
        model = models.KeyedVectors.load_word2vec_format(wv_path, binary=False)
    # Word2Vec exposes its vocabulary via .wv; a KeyedVectors instance holds it
    # directly (gensim 3.x only keeps .wv on KeyedVectors as a deprecated alias).
    wv_vocab = model.wv.vocab if hasattr(model, "wv") else model.vocab
    raw_data = read_cut_file(global_train_path, True)
    if level_type in ["word", "wc"]:
        sent1, sent2 = raw_data["sent1w"], raw_data["sent2w"]
        analyze_wv_vocab_coverage_helper(
            sent1, sent2, min_count, wv_vocab,
            "-".join(wv_path.split("-")[0:-1]).replace("wc", "word")
            + "-" + str(min_count) + "_vc.png")
    if level_type in ["char", "wc"]:
        sent1, sent2 = raw_data["sent1c"], raw_data["sent2c"]
        analyze_wv_vocab_coverage_helper(
            sent1, sent2, min_count, wv_vocab,
            "-".join(wv_path.split("-")[0:-1]).replace("wc", "char")
            + "-" + str(min_count) + "_vc.png")
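# `analyze_wv_vocab_coverage_helper` is defined elsewhere in the repo. Given how
# it is called above, a minimal sketch of what it plausibly does: count tokens
# over both sentence columns, keep those with frequency >= min_count, and report
# how many survive in the embedding vocabulary. The function name and the
# bar-chart details below are assumptions, not the repo's actual implementation.
def _vocab_coverage_sketch(sent1, sent2, min_count, wv_vocab, save_path):
    from collections import Counter
    counts = Counter(tok for sent in sent1 + sent2 for tok in sent)
    kept = [tok for tok, c in counts.items() if c >= min_count]
    covered = sum(1 for tok in kept if tok in wv_vocab)
    print("coverage: %d / %d = %.4f"
          % (covered, len(kept), covered / float(len(kept))))
    fig = plt.figure()
    plt.bar(["covered", "missing"], [covered, len(kept) - covered])
    plt.show()
    fig.savefig(save_path)
    plt.close()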
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    self.total_size = len(self.df)
    self.cursor = 0
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
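# `shuffle` itself is not part of this excerpt. A minimal sketch of what it
# plausibly does for this plain iterator, assuming `loop` counts completed
# passes over the data (it starts at -1 and the constructor shuffles once):
def shuffle(self):
    # Randomly permute the rows and restart the epoch cursor.
    self.df = self.df.sample(frac=1).reset_index(drop=True)
    self.cursor = 0
    self.loop += 1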
def createLocalWCDict(trainFile,
                      min_count_w=2,
                      min_count_c=2,
                      global_dict_path="data/atec/training-2-2.json"):
    """Build per-training-file (local) word/char <-> index dictionaries,
    reusing the ids assigned by the global dictionary."""
    global_dict = loadDict(global_dict_path)
    global_w_v2i = global_dict["word"]["v2i"]
    global_c_v2i = global_dict["char"]["v2i"]
    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])
    # Count word and char frequencies over both sentence columns.
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1
    print("Size for text words: ", len(words))
    print("Size for global words: ", len(global_w_v2i))
    print("Size for text chars: ", len(chars))
    print("Size for global chars: ", len(global_c_v2i))
    # Keep tokens above the frequency thresholds and reserve <pad>/<unk>.
    vocab = ["<pad>", "<unk>"] + [w for w in words if words[w] >= min_count_w]
    vocab_c = ["<pad>", "<unk>"] + [c for c in chars if chars[c] >= min_count_c]
    # Every surviving local token is assumed to exist in the global dictionary
    # (the local file is a subset of the data the global dictionary was built on).
    v2i, i2v = {}, {}
    for word in vocab:
        idx = global_w_v2i[word]
        v2i[word] = idx
        i2v[idx] = word
    print("id for <pad>: ", v2i["<pad>"])
    print("id for <unk>: ", v2i["<unk>"])
    print("total vocab size: ", len(v2i))
    w_dict = {"v2i": v2i, "i2v": i2v}
    v2i, i2v = {}, {}
    for char in vocab_c:
        idx = global_c_v2i[char]
        v2i[char] = idx
        i2v[idx] = char
    print("id for <pad>: ", v2i["<pad>"])
    print("id for <unk>: ", v2i["<unk>"])
    print("total vocab size: ", len(v2i))
    c_dict = {"v2i": v2i, "i2v": i2v}
    d = {"word": w_dict, "char": c_dict}
    saveDict(d, savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
    return d
def get_corpus(file_path="atec/training.csv", corpus_path="atec/atec"):
    """Generate char-level and word-level corpora from the given data."""
    target_char = codecs.open(corpus_path + "_char", "w", encoding="utf-8")
    target_word = codecs.open(corpus_path + "_word", "w", encoding="utf-8")
    raw_data = read_cut_file(file_path, True)
    w1, w2 = raw_data["sent1w"], raw_data["sent2w"]
    c1, c2 = raw_data["sent1c"], raw_data["sent2c"]
    # One space-joined sentence per line, both sentence columns concatenated.
    target_char.writelines([" ".join(c) + "\n" for c in c1])
    target_char.writelines([" ".join(c) + "\n" for c in c2])
    target_word.writelines([" ".join(w) + "\n" for w in w1])
    target_word.writelines([" ".join(w) + "\n" for w in w2])
    target_char.close()
    target_word.close()
    print("well done.")
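# A usage sketch for the corpora written by get_corpus(): train word vectors
# with gensim 3.x (where the dimension argument is named `size`). The output
# layout below ("word2vec/<name>_word-<dim>") mirrors what
# analyze_wv_vocab_coverage() parses out of wv_path; the concrete paths are
# illustrative assumptions, not taken from the repo.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

get_corpus("atec/training.csv", "atec/atec")
w2v = Word2Vec(LineSentence("atec/atec_word"), size=300, min_count=2)
w2v.save("word2vec/atec_word-300")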
def createGlobalWCDict(trainFile="data/atec/training.csv",
                       min_count_w=2,
                       min_count_c=2):
    """Record the global char/word <-> index mappings."""
    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])
    # Count word and char frequencies over both sentence columns.
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1
    # Drop rare tokens, then assign ids with <pad>=0 and <unk>=1.
    vocab = [w for w in words if words[w] >= min_count_w]
    vocab_c = [c for c in chars if chars[c] >= min_count_c]
    int_to_vocab = dict(enumerate(["<pad>", "<unk>"] + vocab))
    vocab_to_int = {v: i for i, v in int_to_vocab.items()}
    print("id for <pad>: ", vocab_to_int["<pad>"])
    print("id for <unk>: ", vocab_to_int["<unk>"])
    print("total vocab size: ", len(int_to_vocab))
    word_dict = {"i2v": int_to_vocab, "v2i": vocab_to_int}
    int_to_vocab = dict(enumerate(["<pad>", "<unk>"] + vocab_c))
    vocab_to_int = {v: i for i, v in int_to_vocab.items()}
    print("id for <pad>: ", vocab_to_int["<pad>"])
    print("id for <unk>: ", vocab_to_int["<unk>"])
    print("total vocab size: ", len(int_to_vocab))
    char_dict = {"i2v": int_to_vocab, "v2i": vocab_to_int}
    cw_dict = {"char": char_dict, "word": word_dict}
    saveDict(cw_dict,
             savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
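# `saveDict` and `loadDict` are project helpers this excerpt does not define.
# A minimal JSON-backed sketch, assuming that is roughly what they wrap. One
# caveat such a round trip implies: JSON object keys are strings, so the
# integer keys of "i2v" come back as strings, and consumers indexing i2v by id
# would need an int(key) conversion after loading.
import json

def saveDict(d, path):
    with codecs.open(path, "w", encoding="utf-8") as f:
        json.dump(d, f, ensure_ascii=False)

def loadDict(path):
    with codecs.open(path, encoding="utf-8") as f:
        return json.load(f)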
def label_distribution(trainFile="atec/training.csv"):
    """Analyze the label distribution of the training data."""
    labels = read_cut_file(file_path=trainFile, with_label=True)["label"]
    neg_count = labels.count(0)
    pos_count = labels.count(1)
    assert neg_count + pos_count == len(labels)
    counts = [neg_count, pos_count]
    labels = ["not synonymous", "synonymous"]
    fig = plt.figure(figsize=(9, 9))
    # Pie chart: counts, their labels, percentages with two decimal places.
    plt.pie(counts, labels=labels, autopct="%1.2f%%")
    plt.title("Label distribution", bbox={"facecolor": "0.6", "pad": 5})
    plt.show()
    savePath = trainFile.split(".")[0] + "_ld.png"
    fig.savefig(savePath)
    plt.close()
def sentence_length_distribution(trainFile="atec/training.csv"):
    """Analyze the sentence-length distribution of the training data."""
    raw_data = read_cut_file(file_path=trainFile, with_label=True)
    df = pd.DataFrame(raw_data)
    # Word-level ("w") and char-level ("c") lengths are analyzed separately.
    for l in ["w", "c"]:
        s1 = "sent1" + l + "_len"
        print(df[s1].describe())
        s2 = "sent2" + l + "_len"
        print(df[s2].describe())
        df_ = pd.DataFrame({s1: df[s1], s2: df[s2]})
        fig = plt.figure(figsize=(32, 18))
        df_.boxplot()
        plt.show()
        fig.savefig(trainFile.replace(".csv", "_sl_" + l + ".png"))
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False, num_buckets=5):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    # Sort by sentence-1 word length and split into equally sized buckets so
    # that a batch drawn from one bucket contains sequences of similar length.
    df = self.df.sort_values("sen1w_len").reset_index(drop=True)
    self.total_size = len(df)
    part_size = self.total_size // num_buckets
    self.dfs = []
    for i in range(num_buckets):
        self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
    # The remainder (when total_size is not divisible) goes into the last bucket.
    self.dfs[num_buckets - 1] = pd.concat(
        [self.dfs[num_buckets - 1], df.iloc[num_buckets * part_size:]])
    self.num_buckets = num_buckets
    self.cursor = np.array([0] * num_buckets)
    # Sample every bucket with equal probability (float division so this also
    # behaves correctly under Python 2).
    self.p_list = [1.0 / num_buckets] * num_buckets
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
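# The bucketed iterator's `shuffle` is not shown either. A plausible sketch,
# assuming it permutes each bucket independently and resets the per-bucket
# cursors, so that `next` can draw a batch from one bucket chosen via `p_list`:
def shuffle(self):
    for i in range(self.num_buckets):
        self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
        self.cursor[i] = 0
    self.loop += 1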
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False, max_len_w=20, max_len_c=40):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    self.total_size = len(self.df)
    # Pad/truncate every sentence into fixed-size arrays. Rows are
    # zero-initialised, so positions past the (clipped) length stay <pad> (id 0).
    res1 = np.zeros(shape=[self.total_size, max_len_w], dtype=np.int32)
    res2 = np.zeros(shape=[self.total_size, max_len_w], dtype=np.int32)
    # min_ele_array clips the stored lengths at the maximum length.
    self.df["sen1w_len"] = min_ele_array(self.df["sen1w_len"].values, max_len_w)
    self.df["sen2w_len"] = min_ele_array(self.df["sen2w_len"].values, max_len_w)
    if modeC == 0:
        # Flat char mode: one row of at most max_len_c char ids per sentence.
        res1_c = np.zeros(shape=[self.total_size, max_len_c], dtype=np.int32)
        res2_c = np.zeros(shape=[self.total_size, max_len_c], dtype=np.int32)
        self.df["sen1c_len"] = min_ele_array(self.df["sen1c_len"].values, max_len_c)
        self.df["sen2c_len"] = min_ele_array(self.df["sen2c_len"].values, max_len_c)
        for idx in range(self.total_size):
            res1[idx, :self.df["sen1w_len"].values[idx]] = \
                self.df["sen1w"].values[idx][:self.df["sen1w_len"].values[idx]]
            res2[idx, :self.df["sen2w_len"].values[idx]] = \
                self.df["sen2w"].values[idx][:self.df["sen2w_len"].values[idx]]
            res1_c[idx, :self.df["sen1c_len"].values[idx]] = \
                self.df["sen1c"].values[idx][:self.df["sen1c_len"].values[idx]]
            res2_c[idx, :self.df["sen2c_len"].values[idx]] = \
                self.df["sen2c"].values[idx][:self.df["sen2c_len"].values[idx]]
    else:
        # Per-word char mode: modeC char ids for each of the max_len_w words.
        res1_c = np.zeros(shape=[self.total_size, max_len_w, modeC], dtype=np.int32)
        res2_c = np.zeros(shape=[self.total_size, max_len_w, modeC], dtype=np.int32)
        for idx in range(self.total_size):
            res1[idx, :self.df["sen1w_len"].values[idx]] = \
                self.df["sen1w"].values[idx][:self.df["sen1w_len"].values[idx]]
            res2[idx, :self.df["sen2w_len"].values[idx]] = \
                self.df["sen2w"].values[idx][:self.df["sen2w_len"].values[idx]]
            for jdx in range(self.df["sen1w_len"].values[idx]):
                res1_c[idx, jdx] = self.df["sen1c"].values[idx][jdx]
            for jdx in range(self.df["sen2w_len"].values[idx]):
                res2_c[idx, jdx] = self.df["sen2c"].values[idx][jdx]
    self.df["sen1w"] = res1.tolist()
    self.df["sen2w"] = res2.tolist()
    self.df["sen1c"] = res1_c.tolist()
    self.df["sen2c"] = res2_c.tolist()
    self.cursor = 0
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
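# `min_ele_array` is a project helper not defined in this excerpt; from its use
# above it clips each stored length at the given maximum. A one-line sketch:
def min_ele_array(arr, cap):
    return np.minimum(arr, cap)  # element-wise minimum against the scalar cap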
def evaluate(self, validFile=None, dictPath=None, load_path=None):
    """Evaluate a trained model on a validation file.

    :param dictPath: the local dict path that matches the model's training data
    """
    assert validFile is not None and dictPath is not None and load_path is not None
    # Use the local dict only for embeddings trained from scratch; otherwise
    # fall back to the global dict.
    if self.config.wv_config["train_w"]:
        dictPathW = dictPath
    else:
        dictPathW = self.config.global_dict
    if self.config.wv_config["train_c"]:
        dictPathC = dictPath
    else:
        dictPathC = self.config.global_dict
    val_generator = DataIterator(validFile, True, dictPathW, dictPathC,
                                 self.config.modeC)
    load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(load_path)
    log_dir = ensure_dir_exist(load_dir.replace("checkpoints", "logs"))
    logger = my_logger(log_dir + "/log_evaluate.txt")
    logger.info("Evaluating with file: %s, local dict: %s..." % (validFile, dictPath))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.6
    with tf.Session(config=config, graph=self.graph) as sess:
        logger.info("Loading model...")
        saver = tf.train.Saver(self.var_list)
        if os.path.isdir(load_path):
            ckpt = tf.train.get_checkpoint_state(load_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = ckpt.model_checkpoint_path.split("-")[-1]
        else:
            saver.restore(sess, load_path)
            global_step = load_path.split("-")[-1]
        logger.info("Loading successfully, loaded epoch is %s" % global_step)
        # Run the whole validation set through the model in batches of 1024.
        batch = val_generator.next(1024, need_all=True)
        res = {}
        while val_generator.loop == 0:
            pos_prob, pred = sess.run(
                [self.pos_prob, self.predicted],
                feed_dict=self._get_valid_feed_dict(batch))
            for (sid, p, la, pr) in zip(batch["id"], pos_prob, batch["label"], pred):
                res[sid] = [float(p), int(la), int(pr)]
            batch = val_generator.next(1024, need_all=True)
        res = [[int(key), float(value[0]), int(value[1]), int(value[2])]
               for (key, value) in res.items()]
        tmp = pd.DataFrame(res, columns=["id", "pos_prob", "label", "pred"])
        tmp = tmp.sort_values(by="id", axis=0, ascending=True)
        ids = np.asarray(tmp["id"].values, dtype=np.int64)
        id_v = read_cut_file(validFile, True)["id"]
        assert np.allclose(np.sort(ids), np.array(id_v)), "Inconsistent indices!"
        # Sweep the decision threshold to see how the metrics trade off.
        for t in np.arange(0, 1, 0.05):
            pred = np.greater_equal(tmp["pos_prob"].values, np.asarray([t]))
            pred = np.asarray(pred, dtype=np.int64)
            if t == 0.5:
                # Sanity check: the model's own argmax prediction should match
                # thresholding the positive probability at 0.5.
                assert np.allclose(pred, tmp["pred"].values), "Inconsistent prediction!"
            f1 = f1_score(y_pred=pred, y_true=tmp["label"])
            acc = accuracy_score(y_pred=pred, y_true=tmp["label"])
            pre = precision_score(y_pred=pred, y_true=tmp["label"])
            rec = recall_score(y_pred=pred, y_true=tmp["label"])
            logger.info("Threshold: %.2f, F1: %.4f, A: %.4f, P: %.4f, R: %.4f"
                        % (t, f1, acc, pre, rec))
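# A hypothetical call, assuming a trained model instance and the
# checkpoints/local-dict layout this method expects; the paths below are
# illustrative, not taken from the repo.
model.evaluate(validFile="data/atec/valid.csv",
               dictPath="data/atec/training-2-2.json",
               load_path="checkpoints/model-10")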