示例#1
0
def load_data_test():
    """Tokenize news titles and bodies fetched from the database and dump the
    token lists to ``text.txt``, one comma-separated line per text.

    :return: None; writes ``text.txt`` as a side effect.
    """
    # Build the preprocessing helpers: data cleaner, dictionaries, stop words.
    dp = data_process.DataPressing()
    dict_init = dicts.init()  # kept for its load side effect
    stop_words = load_stop_words()
    # Word segmentation
    tk = tokenization.Tokenizer(dp, stop_words)
    # Fetch all news rows from the three tables.
    df_result = get_data()  # NOTE(review): this interface has changed; mind the call signature
    res_lists = []
    for _, row in df_result.iterrows():
        title, content = row["title"], row["content"]
        # ``if title:`` already rejects both None and the empty string, so the
        # original ``is not None and title`` test was redundant.
        if title:
            title = dp.no_remove(title)
            if not dp.useless_filter(title, dicts.stock_dict):
                res_lists.append(tk.token(title))

        if content:
            content = dp.no_remove(content)
            if not dp.useless_filter(content, dicts.stock_dict):
                res_lists.append(tk.token(content))
    # Bug fix: the original concatenated ``bytes`` (item.encode("utf8")) with
    # ``str`` ("\n"), a TypeError on Python 3. Write text through a UTF-8
    # handle and let ``with`` close it reliably.
    with open("text.txt", "w", encoding="utf-8") as file_out:
        for token_list in res_lists:
            file_out.write(",".join(token_list) + "\n")
示例#2
0
def main():
    """Convert a raw text file into a MindRecord dataset.

    Command-line driven: reads ``--input_file`` line by line, turns each line
    into model features via ``create_instance`` and writes them to
    ``--output_file`` with ``FileWriter``. The first 20 instances are logged
    as examples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True, help='Input raw text file.')
    parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=1,
                        help='The MindRecord file will be split into the number of partition. ')
    parser.add_argument("--max_length", type=int, required=True, help='Maximum sequence length.')
    # NOTE: a ``default`` on a ``required=True`` argument is dead code (the
    # value can never be used), so the original default='' / 'cnn_dailymail'
    # values were dropped.
    parser.add_argument("--vocab_file", type=str, required=True, help='url of gpt2-vocab.json ')
    parser.add_argument("--merge_file", type=str, required=True, help='url of gpt2-merges.txt ')
    parser.add_argument("--mode", type=str, required=True, help='mode of dataset creation')
    args = parser.parse_args()

    tokenizer = tokenization.Tokenizer(vocab_file=args.vocab_file, merge_file=args.merge_file, mode=args.mode)
    input_file = args.input_file
    logging.info("***** Reading from input files *****")
    logging.info("Input File: %s", input_file)

    output_file = args.output_file
    logging.info("***** Writing to output files *****")
    logging.info("Output File: %s", output_file)

    writer = FileWriter(output_file, args.num_splits)
    data_schema = {"input_ids": {"type": "int64", "shape": [-1]},
                   "input_mask": {"type": "int64", "shape": [-1]},
                   "label_ids": {"type": "int64", "shape": [-1]}
                   }
    writer.add_schema(data_schema, "wikitext2-schema")

    total_written = 0
    total_read = 0

    logging.info("***** Reading from  %s *****", input_file)
    # Explicit encoding so the run does not depend on the platform default;
    # iterating the file object replaces the manual readline() loop.
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            total_read += 1
            if total_read % 500 == 0:
                logging.info("%d ...", total_read)

            output = create_instance(tokenizer, line, args.max_length)
            features = write_instance_to_file(writer, instance=output)
            total_written += 1

            # Log the first 20 instances so the dataset can be spot-checked.
            if total_written <= 20:
                logging.info("***** Example *****")
                # input is the sequence shifted right, label shifted left.
                logging.info("input tokens: %s", tokenizer.decode(output["input_ids"][:-1]))
                logging.info("label tokens: %s", tokenizer.decode(output["input_ids"][1:]))

                for feature_name, feature in features.items():
                    logging.info("%s: %s", feature_name, feature)

    writer.commit()
    logging.info("Wrote %d total instances", total_written)
示例#3
0
 def _cut_sentence(self, sentence):
     """Segment *sentence* into words and store the tokens on ``self.word_list``.

     :param sentence: raw text to tokenize.
     :return: None; result is kept in ``self.word_list``.
     """
     # NOTE(review): these helpers are rebuilt on every call; this needs
     # adjusting before running under multiprocessing.
     cleaner = data_process.DataPressing()
     dict_init = dicts.init()  # called for its dictionary-loading side effect
     stop_words = tokenization.load_stop_words()
     tokenizer = tokenization.Tokenizer(cleaner, stop_words)
     self.word_list = tokenizer.token(sentence)
示例#4
0
def data_save():
    """Read news from the database, preprocess it and persist it locally.

    Used downstream for single-pass clustering of historical events,
    word-vector training and keyword extraction. Two files are written:

    * ``conf.corpus_news``:        ``news_id \\t unix_time \\t tokenized text``
    * ``conf.corpus_news_title``:  ``news_id \\t unix_time \\t title``

    :return: None; writes the two corpus files as a side effect.
    """
    dp = data_process.DataPressing()
    dict_init = dicts.init()  # kept for its load side effect
    stop_words = load_stop_words()
    tk = tokenization.Tokenizer(dp, dict_init, stop_words)  # word segmentation
    df_result = get_data()

    # Scheme 1: title and body are concatenated into one document, and a news
    # item is kept only when both title and body are present.
    res_lists = []
    for i in tqdm(range(len(df_result))):
        row = df_result.iloc[i]
        news_id = row['id']
        title = row['title']
        content = row['content']
        unix_time = row['unix_time']
        if content and title:
            news_id = news_id.strip()
            title = title.strip()
            string_list = tk.token(title + content.strip())
            if not dp.useless_filter(string_list, dicts.stock_dict):
                # Tuple layout: (news_id, title, tokens, timestamp).
                res_lists.append((news_id, title, string_list, unix_time))
    logging.logger.info("提取的文章的个数: %s" % len(res_lists))

    # Bug fix: the original wrote ``str + item.encode("utf8") + str`` — a
    # ``str``/``bytes`` mix that raises TypeError on Python 3. Write text
    # through UTF-8 handles and close them via ``with``.
    # [news_id, timestamp, tokenized body]
    with open(conf.corpus_news, "w", encoding="utf-8") as file_out:
        for news_id, title, tokens, unix_time in res_lists:
            file_out.write("%s\t%s\t%s\n" % (news_id, unix_time, " ".join(tokens)))
    # [news_id, timestamp, title]
    with open(conf.corpus_news_title, "w", encoding="utf-8") as file_out:
        for news_id, title, tokens, unix_time in res_lists:
            file_out.write("%s\t%s\t%s\n" % (news_id, unix_time, title))
示例#5
0
def d_test():
    """Smoke-test the TextRank keyword extractor on a sample news article.

    :return: None; keywords are logged and printed.
    """
    # Sample financial news text (Chinese).
    s = '中兴通讯(000063)在经历七个一字跌停板后,于今天打开跌停板。债转股开盘大涨,天津普林(002134)、信达地产(600657)' \
        '、海德股份(000567)集体涨停,长航凤凰(000520)、浙江东方(600120)、陕国投A(000563)大涨,消息面上,' \
        '央行宣布定向降准0.5个百分点,将重点支持债转股。中兴通讯机构最低估值12.02元/股在复牌之前,' \
        '多家基金公司对中兴通讯估值大多调整至20.54元/股。连续7个跌停板后,中兴通讯A股股价早就已经跌穿这一价格。' \
        '据《中国经营报》记者不完全统计,6月20日~22日,多家基金公司再做出调整中兴通讯A股估值的公告,下调公司包括工银瑞信基金、' \
        '华泰柏瑞基金、东方基金、大摩华鑫基金、融通基金、大成基金等22家基金公司。值得注意的是,此次基金公司估值下调幅度并不一致,' \
        '调整估值在每股12.02~16.64元之间。其中,大摩华鑫基金、融通基金和安信基金给出的估值最高,为每股16.64元,而工银瑞信基金、' \
        '富国基金和泰达宏利基金给出的估值最低,为每股12.02元。关注同花顺财经(ths518),获取更多机会'

    # Preprocessing stack: data cleaner, dictionaries, stop words, tokenizer.
    cleaner = data_process.DataPressing()
    dict_init = dicts.init()  # called for its dictionary-loading side effect
    stop_words = tokenization.load_stop_words()
    tokenizer = tokenization.Tokenizer(cleaner, stop_words)
    word_list = tokenizer.token(s)

    # Extract the top-15 keywords (the count could also be derived from the
    # sentence length instead of being fixed).
    extractor = TextRank(top_k=15)
    keywords = extractor.run(word_list)
    logging.logger.info("提取的%s个关键词: " % len(keywords))
    if extractor.withWeight:
        # Weighted mode: results are (word, weight) pairs.
        print(",".join(pair[0] for pair in keywords))
        print(",".join(str(pair[1]) for pair in keywords))
    else:
        print(",".join(str(word) for word in keywords))
示例#6
0
def multi_extract_test():
    """Benchmark serial versus multiprocessing keyword extraction.

    Runs ``parallel_test`` 10,000 times back to back, then fans the same
    workload out over a process pool (one worker per CPU core) and logs both
    elapsed times.

    :return: None; timings are logged.
    """
    import time
    from multiprocessing import Pool
    import multiprocessing as mp

    # Sample text (Chinese).
    s = '程序员(英文Programmer)是从事程序开发、维护的专业人员。' \
        '一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,' \
        '特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。'

    cleaner = data_process.DataPressing()
    dict_init = dicts.init()  # called for its dictionary-loading side effect
    stop_words = tokenization.load_stop_words()
    # Word segmentation
    tokenizer = tokenization.Tokenizer(cleaner, stop_words)
    word_list = tokenizer.token(s)

    # Serial baseline.
    serial_start = time.time()
    for _ in range(10000):
        parallel_test(word_list)
    logging.logger.info("串行处理花费时间{t}".format(t=time.time() - serial_start))

    # Parallel run: submit the same job asynchronously to a pool sized to the
    # machine's core count, then wait for all workers to finish.
    pool = Pool(processes=int(mp.cpu_count()))
    async_results = []
    parallel_start = time.time()
    for _ in range(10000):
        async_results.append(pool.apply_async(parallel_test, (word_list, )))
    pool.close()
    pool.join()
    logging.logger.info("并行处理花费时间{t}s".format(t=time.time() - parallel_start))
# Log the cut-off time for reading news, then pull the rows ordered by time.
logging.logger.info('读取新闻的起始时间: {}'.format(today))
ordered_df = data_reader.get_ordered_data(timestamp=today_timestamp)

# Load the tf-idf vector-space model (feature vocabulary + transformer).
# tfidf_feature_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/feature_1.pkl'
# tfidftransformer_path = '/Users/li/PycharmProjects/event_parser/src/model/tfidf_model/tfidftransformer_1.pkl'
tfidf_feature_path = conf.tfidf_feature_path
tfidf_transformer_path = conf.tfidftransformer_path
tfidf_feature = tfidf.load_tfidf_feature(tfidf_feature_path)
tfidf_transformer = tfidf.load_tfidf_transformer(tfidf_transformer_path)

# Build the preprocessing stack: data cleaner, dictionaries (loaded for side
# effect), stop words and the tokenizer.
dp, dict_init, stop_words = data_process.DataPressing(), dicts.init(
), tokenization.load_stop_words()
tk = tokenization.Tokenizer(dp, stop_words)

# Extract and vectorize the DataFrame contents.
ordered_news_lists = data_reader.trans_df_data(ordered_df, tfidf_feature,
                                               tfidf_transformer, dp, tk)

# If no news arrived today there is nothing to cluster: exit without touching
# the event units.
if len(ordered_news_lists) <= 0:
    # print '今天没有新新闻,事件单元不更新'
    logging.logger.info('[事件库未更新]: 今天没有新新闻,事件单元不更新')
    sys.exit()

# for tmp in ordered_news_lists:
#     print tmp[0], tmp[1]