# -*- coding: utf-8 -*-
import codecs
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from gensim import models
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Project-local helpers such as read_cut_file, loadDict, saveDict, my_logger,
# ensure_dir_exist and min_ele_array are assumed to be importable from the
# repo's utility modules.


def analyze_wv_vocab_coverage(wv_path,
                              global_train_path="../data/atec/training.csv",
                              min_count=2):
    """Analyze vocabulary coverage between the word vectors and the training data."""
    # The path is expected to look like ".../<file_type>/<name>_<level>-...",
    # e.g. "word2vec/atec_word-300", so level and toolkit are parsed from it.
    wv_name = wv_path.split("/")[-1]
    level_type = wv_name.split("_")[-1].split("-")[0]
    assert level_type in ["char", "word", "wc"]
    file_type = wv_path.split("/")[-2]
    assert file_type in ["glove", "word2vec", "fasttext"]
    if file_type == "word2vec":
        model = models.Word2Vec.load(wv_path)
    else:
        model = models.KeyedVectors.load_word2vec_format(wv_path, binary=False)
    # Word2Vec exposes its vocabulary via .wv; a KeyedVectors instance holds it
    # directly (gensim 3.x only keeps .wv on KeyedVectors as a deprecated alias).
    wv_vocab = model.wv.vocab if hasattr(model, "wv") else model.vocab
    raw_data = read_cut_file(global_train_path, True)
    if level_type in ["word", "wc"]:
        sent1, sent2 = raw_data["sent1w"], raw_data["sent2w"]
        analyze_wv_vocab_coverage_helper(
            sent1, sent2, min_count, wv_vocab,
            "-".join(wv_path.split("-")[0:-1]).replace("wc", "word")
            + "-" + str(min_count) + "_vc.png")
    if level_type in ["char", "wc"]:
        sent1, sent2 = raw_data["sent1c"], raw_data["sent2c"]
        analyze_wv_vocab_coverage_helper(
            sent1, sent2, min_count, wv_vocab,
            "-".join(wv_path.split("-")[0:-1]).replace("wc", "char")
            + "-" + str(min_count) + "_vc.png")
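# `analyze_wv_vocab_coverage_helper` is defined elsewhere in the repo. Given how
# it is called above, a minimal sketch of what it plausibly does: count tokens
# over both sentence columns, keep those with frequency >= min_count, and report
# how many survive in the embedding vocabulary. The function name and the
# bar-chart details below are assumptions, not the repo's actual implementation.
def _vocab_coverage_sketch(sent1, sent2, min_count, wv_vocab, save_path):
    from collections import Counter
    counts = Counter(tok for sent in sent1 + sent2 for tok in sent)
    kept = [tok for tok, c in counts.items() if c >= min_count]
    covered = sum(1 for tok in kept if tok in wv_vocab)
    print("coverage: %d / %d = %.4f"
          % (covered, len(kept), covered / float(len(kept))))
    fig = plt.figure()
    plt.bar(["covered", "missing"], [covered, len(kept) - covered])
    plt.show()
    fig.savefig(save_path)
    plt.close()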
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    self.total_size = len(self.df)
    self.cursor = 0
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
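# `shuffle` itself is not part of this excerpt. A minimal sketch of what it
# plausibly does for this plain iterator, assuming `loop` counts completed
# passes over the data (it starts at -1 and the constructor shuffles once):
def shuffle(self):
    # Randomly permute the rows and restart the epoch cursor.
    self.df = self.df.sample(frac=1).reset_index(drop=True)
    self.cursor = 0
    self.loop += 1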
def createLocalWCDict(trainFile,
                      min_count_w=2,
                      min_count_c=2,
                      global_dict_path="data/atec/training-2-2.json"):
    """Build per-training-file (local) word/char <-> index dictionaries,
    reusing the ids assigned by the global dictionary."""
    global_dict = loadDict(global_dict_path)
    global_w_v2i = global_dict["word"]["v2i"]
    global_c_v2i = global_dict["char"]["v2i"]
    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])
    # Count word and char frequencies over both sentence columns.
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1
    print("Size for text words: ", len(words))
    print("Size for global words: ", len(global_w_v2i))
    print("Size for text chars: ", len(chars))
    print("Size for global chars: ", len(global_c_v2i))
    # Keep tokens above the frequency thresholds and reserve <pad>/<unk>.
    vocab = ["<pad>", "<unk>"] + [w for w in words if words[w] >= min_count_w]
    vocab_c = ["<pad>", "<unk>"] + [c for c in chars if chars[c] >= min_count_c]
    # Every surviving local token is assumed to exist in the global dictionary
    # (the local file is a subset of the data the global dictionary was built on).
    v2i, i2v = {}, {}
    for word in vocab:
        idx = global_w_v2i[word]
        v2i[word] = idx
        i2v[idx] = word
    print("id for <pad>: ", v2i["<pad>"])
    print("id for <unk>: ", v2i["<unk>"])
    print("total vocab size: ", len(v2i))
    w_dict = {"v2i": v2i, "i2v": i2v}
    v2i, i2v = {}, {}
    for char in vocab_c:
        idx = global_c_v2i[char]
        v2i[char] = idx
        i2v[idx] = char
    print("id for <pad>: ", v2i["<pad>"])
    print("id for <unk>: ", v2i["<unk>"])
    print("total vocab size: ", len(v2i))
    c_dict = {"v2i": v2i, "i2v": i2v}
    d = {"word": w_dict, "char": c_dict}
    saveDict(d, savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
    return d
def get_corpus(file_path="atec/training.csv", corpus_path="atec/atec"):
    """Generate char-level and word-level corpora from the given data."""
    target_char = codecs.open(corpus_path + "_char", "w", encoding="utf-8")
    target_word = codecs.open(corpus_path + "_word", "w", encoding="utf-8")
    raw_data = read_cut_file(file_path, True)
    w1, w2 = raw_data["sent1w"], raw_data["sent2w"]
    c1, c2 = raw_data["sent1c"], raw_data["sent2c"]
    # One space-joined sentence per line, both sentence columns concatenated.
    target_char.writelines([" ".join(c) + "\n" for c in c1])
    target_char.writelines([" ".join(c) + "\n" for c in c2])
    target_word.writelines([" ".join(w) + "\n" for w in w1])
    target_word.writelines([" ".join(w) + "\n" for w in w2])
    target_char.close()
    target_word.close()
    print("well done.")
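# A usage sketch for the corpora written by get_corpus(): train word vectors
# with gensim 3.x (where the dimension argument is named `size`). The output
# layout below ("word2vec/<name>_word-<dim>") mirrors what
# analyze_wv_vocab_coverage() parses out of wv_path; the concrete paths are
# illustrative assumptions, not taken from the repo.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

get_corpus("atec/training.csv", "atec/atec")
w2v = Word2Vec(LineSentence("atec/atec_word"), size=300, min_count=2)
w2v.save("word2vec/atec_word-300")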
def createGlobalWCDict(trainFile="data/atec/training.csv",
                       min_count_w=2,
                       min_count_c=2):
    """Record the global char/word <-> index mappings."""
    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])
    # Count word and char frequencies over both sentence columns.
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1
    # Drop rare tokens, then assign ids with <pad>=0 and <unk>=1.
    vocab = [w for w in words if words[w] >= min_count_w]
    vocab_c = [c for c in chars if chars[c] >= min_count_c]
    int_to_vocab = dict(enumerate(["<pad>", "<unk>"] + vocab))
    vocab_to_int = {v: i for i, v in int_to_vocab.items()}
    print("id for <pad>: ", vocab_to_int["<pad>"])
    print("id for <unk>: ", vocab_to_int["<unk>"])
    print("total vocab size: ", len(int_to_vocab))
    word_dict = {"i2v": int_to_vocab, "v2i": vocab_to_int}
    int_to_vocab = dict(enumerate(["<pad>", "<unk>"] + vocab_c))
    vocab_to_int = {v: i for i, v in int_to_vocab.items()}
    print("id for <pad>: ", vocab_to_int["<pad>"])
    print("id for <unk>: ", vocab_to_int["<unk>"])
    print("total vocab size: ", len(int_to_vocab))
    char_dict = {"i2v": int_to_vocab, "v2i": vocab_to_int}
    cw_dict = {"char": char_dict, "word": word_dict}
    saveDict(cw_dict,
             savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
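# `saveDict` and `loadDict` are project helpers this excerpt does not define.
# A minimal JSON-backed sketch, assuming that is roughly what they wrap. One
# caveat such a round trip implies: JSON object keys are strings, so the
# integer keys of "i2v" come back as strings, and consumers indexing i2v by id
# would need an int(key) conversion after loading.
import json

def saveDict(d, path):
    with codecs.open(path, "w", encoding="utf-8") as f:
        json.dump(d, f, ensure_ascii=False)

def loadDict(path):
    with codecs.open(path, encoding="utf-8") as f:
        return json.load(f)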
def label_distribution(trainFile="atec/training.csv"):
    """Analyze the label distribution of the training data."""
    labels = read_cut_file(file_path=trainFile, with_label=True)["label"]
    neg_count = labels.count(0)
    pos_count = labels.count(1)
    assert neg_count + pos_count == len(labels)
    counts = [neg_count, pos_count]
    labels = ["not synonymous", "synonymous"]
    fig = plt.figure(figsize=(9, 9))
    # Pie chart: counts, their labels, percentages with two decimal places.
    plt.pie(counts, labels=labels, autopct="%1.2f%%")
    plt.title("Label distribution", bbox={"facecolor": "0.6", "pad": 5})
    plt.show()
    savePath = trainFile.split(".")[0] + "_ld.png"
    fig.savefig(savePath)
    plt.close()
def sentence_length_distribution(trainFile="atec/training.csv"):
    """Analyze the sentence-length distribution of the training data."""
    raw_data = read_cut_file(file_path=trainFile, with_label=True)
    df = pd.DataFrame(raw_data)
    # Word-level ("w") and char-level ("c") lengths are analyzed separately.
    for l in ["w", "c"]:
        s1 = "sent1" + l + "_len"
        print(df[s1].describe())
        s2 = "sent2" + l + "_len"
        print(df[s2].describe())
        df_ = pd.DataFrame({s1: df[s1], s2: df[s2]})
        fig = plt.figure(figsize=(32, 18))
        df_.boxplot()
        plt.show()
        fig.savefig(trainFile.replace(".csv", "_sl_" + l + ".png"))
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False, num_buckets=5):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    # Sort by sentence-1 word length and split into equally sized buckets so
    # that a batch drawn from one bucket contains sequences of similar length.
    df = self.df.sort_values("sen1w_len").reset_index(drop=True)
    self.total_size = len(df)
    part_size = self.total_size // num_buckets
    self.dfs = []
    for i in range(num_buckets):
        self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
    # The remainder (when total_size is not divisible) goes into the last bucket.
    self.dfs[num_buckets - 1] = pd.concat(
        [self.dfs[num_buckets - 1], df.iloc[num_buckets * part_size:]])
    self.num_buckets = num_buckets
    self.cursor = np.array([0] * num_buckets)
    # Sample every bucket with equal probability (float division so this also
    # behaves correctly under Python 2).
    self.p_list = [1.0 / num_buckets] * num_buckets
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
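# The bucketed iterator's `shuffle` is not shown either. A plausible sketch,
# assuming it permutes each bucket independently and resets the per-bucket
# cursors, so that `next` can draw a batch from one bucket chosen via `p_list`:
def shuffle(self):
    for i in range(self.num_buckets):
        self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
        self.cursor[i] = 0
    self.loop += 1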
def __init__(self, filePath, with_label, dictPathW, dictPathC, modeC,
             is_train=False, max_len_w=20, max_len_c=40):
    raw_data = read_cut_file(filePath, with_label, dictPathW, dictPathC, modeC)
    ids, la = raw_data["id"], raw_data["label"]
    s1w, s2w, s1wl, s2wl = (raw_data["sent1w"], raw_data["sent2w"],
                            raw_data["sent1w_len"], raw_data["sent2w_len"])
    s1c, s2c, s1cl, s2cl = (raw_data["sent1c"], raw_data["sent2c"],
                            raw_data["sent1c_len"], raw_data["sent2c_len"])
    if is_train:
        # Augment the training data by swapping sentence 1 and sentence 2.
        self.df = pd.DataFrame({"id": ids + ids, "label": la + la,
                                "sen1w": s1w + s2w, "sen2w": s2w + s1w,
                                "sen1w_len": s1wl + s2wl, "sen2w_len": s2wl + s1wl,
                                "sen1c": s1c + s2c, "sen2c": s2c + s1c,
                                "sen1c_len": s1cl + s2cl, "sen2c_len": s2cl + s1cl})
    else:
        self.df = pd.DataFrame({"id": ids, "label": la,
                                "sen1w": s1w, "sen2w": s2w,
                                "sen1w_len": s1wl, "sen2w_len": s2wl,
                                "sen1c": s1c, "sen2c": s2c,
                                "sen1c_len": s1cl, "sen2c_len": s2cl})
    if with_label:
        # Report the positive/negative class ratio of the raw data.
        df_pos = self.df[self.df["label"] == 1]
        df_neg = self.df[self.df["label"] == 0]
        pn_rate_orig = len(df_pos) / float(len(df_neg))
        print("pn_rate_orig: %f" % pn_rate_orig)
    self.total_size = len(self.df)
    # Pad/truncate every sentence into fixed-size arrays. Rows are
    # zero-initialised, so positions past the (clipped) length stay <pad> (id 0).
    res1 = np.zeros(shape=[self.total_size, max_len_w], dtype=np.int32)
    res2 = np.zeros(shape=[self.total_size, max_len_w], dtype=np.int32)
    # min_ele_array clips the stored lengths at the maximum length.
    self.df["sen1w_len"] = min_ele_array(self.df["sen1w_len"].values, max_len_w)
    self.df["sen2w_len"] = min_ele_array(self.df["sen2w_len"].values, max_len_w)
    if modeC == 0:
        # Flat char mode: one row of at most max_len_c char ids per sentence.
        res1_c = np.zeros(shape=[self.total_size, max_len_c], dtype=np.int32)
        res2_c = np.zeros(shape=[self.total_size, max_len_c], dtype=np.int32)
        self.df["sen1c_len"] = min_ele_array(self.df["sen1c_len"].values, max_len_c)
        self.df["sen2c_len"] = min_ele_array(self.df["sen2c_len"].values, max_len_c)
        for idx in range(self.total_size):
            res1[idx, :self.df["sen1w_len"].values[idx]] = \
                self.df["sen1w"].values[idx][:self.df["sen1w_len"].values[idx]]
            res2[idx, :self.df["sen2w_len"].values[idx]] = \
                self.df["sen2w"].values[idx][:self.df["sen2w_len"].values[idx]]
            res1_c[idx, :self.df["sen1c_len"].values[idx]] = \
                self.df["sen1c"].values[idx][:self.df["sen1c_len"].values[idx]]
            res2_c[idx, :self.df["sen2c_len"].values[idx]] = \
                self.df["sen2c"].values[idx][:self.df["sen2c_len"].values[idx]]
    else:
        # Per-word char mode: modeC char ids for each of the max_len_w words.
        res1_c = np.zeros(shape=[self.total_size, max_len_w, modeC], dtype=np.int32)
        res2_c = np.zeros(shape=[self.total_size, max_len_w, modeC], dtype=np.int32)
        for idx in range(self.total_size):
            res1[idx, :self.df["sen1w_len"].values[idx]] = \
                self.df["sen1w"].values[idx][:self.df["sen1w_len"].values[idx]]
            res2[idx, :self.df["sen2w_len"].values[idx]] = \
                self.df["sen2w"].values[idx][:self.df["sen2w_len"].values[idx]]
            for jdx in range(self.df["sen1w_len"].values[idx]):
                res1_c[idx, jdx] = self.df["sen1c"].values[idx][jdx]
            for jdx in range(self.df["sen2w_len"].values[idx]):
                res2_c[idx, jdx] = self.df["sen2c"].values[idx][jdx]
    self.df["sen1w"] = res1.tolist()
    self.df["sen2w"] = res2.tolist()
    self.df["sen1c"] = res1_c.tolist()
    self.df["sen2c"] = res2_c.tolist()
    self.cursor = 0
    self.loop = -1
    self.shuffle()
    self.modeC = modeC
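# `min_ele_array` is a project helper not defined in this excerpt; from its use
# above it clips each stored length at the given maximum. A one-line sketch:
def min_ele_array(arr, cap):
    return np.minimum(arr, cap)  # element-wise minimum against the scalar cap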
def evaluate(self, validFile=None, dictPath=None, load_path=None):
    """Evaluate a trained model on a validation file.

    :param dictPath: the local dict path that matches the model's training data
    """
    assert validFile is not None and dictPath is not None and load_path is not None
    # Use the local dict only for embeddings trained from scratch; otherwise
    # fall back to the global dict.
    if self.config.wv_config["train_w"]:
        dictPathW = dictPath
    else:
        dictPathW = self.config.global_dict
    if self.config.wv_config["train_c"]:
        dictPathC = dictPath
    else:
        dictPathC = self.config.global_dict
    val_generator = DataIterator(validFile, True, dictPathW, dictPathC,
                                 self.config.modeC)
    load_dir = load_path if os.path.isdir(load_path) else os.path.dirname(load_path)
    log_dir = ensure_dir_exist(load_dir.replace("checkpoints", "logs"))
    logger = my_logger(log_dir + "/log_evaluate.txt")
    logger.info("Evaluating with file: %s, local dict: %s..." % (validFile, dictPath))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.6
    with tf.Session(config=config, graph=self.graph) as sess:
        logger.info("Loading model...")
        saver = tf.train.Saver(self.var_list)
        if os.path.isdir(load_path):
            ckpt = tf.train.get_checkpoint_state(load_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = ckpt.model_checkpoint_path.split("-")[-1]
        else:
            saver.restore(sess, load_path)
            global_step = load_path.split("-")[-1]
        logger.info("Loading successfully, loaded epoch is %s" % global_step)
        # Run the whole validation set through the model in batches of 1024.
        batch = val_generator.next(1024, need_all=True)
        res = {}
        while val_generator.loop == 0:
            pos_prob, pred = sess.run(
                [self.pos_prob, self.predicted],
                feed_dict=self._get_valid_feed_dict(batch))
            for (sid, p, la, pr) in zip(batch["id"], pos_prob, batch["label"], pred):
                res[sid] = [float(p), int(la), int(pr)]
            batch = val_generator.next(1024, need_all=True)
        res = [[int(key), float(value[0]), int(value[1]), int(value[2])]
               for (key, value) in res.items()]
        tmp = pd.DataFrame(res, columns=["id", "pos_prob", "label", "pred"])
        tmp = tmp.sort_values(by="id", axis=0, ascending=True)
        ids = np.asarray(tmp["id"].values, dtype=np.int64)
        id_v = read_cut_file(validFile, True)["id"]
        assert np.allclose(np.sort(ids), np.array(id_v)), "Inconsistent indices!"
        # Sweep the decision threshold to see how the metrics trade off.
        for t in np.arange(0, 1, 0.05):
            pred = np.greater_equal(tmp["pos_prob"].values, np.asarray([t]))
            pred = np.asarray(pred, dtype=np.int64)
            if t == 0.5:
                # Sanity check: the model's own argmax prediction should match
                # thresholding the positive probability at 0.5.
                assert np.allclose(pred, tmp["pred"].values), "Inconsistent prediction!"
            f1 = f1_score(y_pred=pred, y_true=tmp["label"])
            acc = accuracy_score(y_pred=pred, y_true=tmp["label"])
            pre = precision_score(y_pred=pred, y_true=tmp["label"])
            rec = recall_score(y_pred=pred, y_true=tmp["label"])
            logger.info("Threshold: %.2f, F1: %.4f, A: %.4f, P: %.4f, R: %.4f"
                        % (t, f1, acc, pre, rec))
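# A hypothetical call, assuming a trained model instance and the
# checkpoints/local-dict layout this method expects; the paths below are
# illustrative, not taken from the repo.
model.evaluate(validFile="data/atec/valid.csv",
               dictPath="data/atec/training-2-2.json",
               load_path="checkpoints/model-10")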