Example #1
    def multi_word_cut(self, sentences):
        print('Multiprocessing word cut')
        if self.language == 'ch':
            jieba.initialize()  # initialize the dictionary once up front, otherwise each worker process re-initializes it
            jieba.disable_parallel()  # rely on our own process pool instead of jieba's built-in parallel mode

            def func(line):
                # jieba cut, strip whitespace, then drop pure digits and stop words
                line = [i.strip() for i in jieba.cut(line, cut_all=False)]
                return [
                    i for i in line
                    if ((not i.isdigit()) and (i not in self.stop_words))
                ]
        else:

            def func(line):
                # whitespace split, lowercase, drop digits, stop words and single characters
                return [
                    i.lower() for i in line.split(" ")
                    if ((not i.isdigit()) and (i not in self.stop_words) and (len(i) > 1))
                ]

        pool = Pool(nodes=5)  # process pool; Pool(nodes=...) with .clear() matches pathos's ProcessPool
        t0 = time.time()
        word_cut = pool.map(func, sentences)
        pool.close()
        pool.join()
        pool.clear()
        print('MultiProcess time {:.0f}s'.format(time.time() - t0))
        return word_cut
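
The method above depends on names that are not shown in the snippet: a process pool whose Pool(nodes=...) constructor and clear() method match pathos, plus jieba and time, and a class that carries language and stop_words. A minimal sketch of that surrounding setup, where the class name, constructor and sample data are assumptions, not part of the original project:

import time
import jieba
from pathos.multiprocessing import ProcessPool as Pool  # provides Pool(nodes=...), .map, .close, .join, .clear

class WordCutter:
    def __init__(self, language, stop_words):
        self.language = language            # 'ch' selects the jieba branch above
        self.stop_words = set(stop_words)

    # multi_word_cut from the snippet above would be defined here

if __name__ == '__main__':
    cutter = WordCutter('ch', stop_words=['的', '了'])
    # tokens = cutter.multi_word_cut(['今天天气不错', '我们去公园散步'])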
Example #2
def map_get_words(txts, kind="char", return_type="str"):
    # accept either a list of texts or a path to a file with one text per line
    if isinstance(txts, str):
        with open(txts, "r") as f:
            txts = [row.strip() for row in f.readlines()]
    jieba = None
    if kind == "word":
        # word-level segmentation needs jieba_fast plus a custom user dictionary
        import jieba_fast as jieba
        jieba.initialize()
        jieba.load_userdict("dict_fasttext.txt")
    txts = list(map(lambda txt: get_words(txt, kind, return_type, jieba),
                    txts))
    return txts
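
get_words is defined elsewhere in that project. A hypothetical implementation consistent with the call signature above (character-level split by default, jieba_fast word-level cut when a jieba module is passed in) could look like this; the body is an assumption, only the signature is taken from the call above:

def get_words(txt, kind="char", return_type="str", jieba=None):
    # hypothetical helper: split into characters, or into words when a jieba module is supplied
    if kind == "word" and jieba is not None:
        tokens = jieba.lcut(txt)
    else:
        tokens = list(txt)
    return " ".join(tokens) if return_type == "str" else tokens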
Example #3
        dataset = dataset.shuffle(batch_size * 1000)  # shuffle
        dataset = dataset.batch(batch_size)  # group into batches

        return dataset


if __name__ == '__main__':

    # usage test

    from bert4keras.tokenizer import Tokenizer
    import json, glob, re
    import numpy as np
    import jieba_fast as jieba
    from tqdm import tqdm

    jieba.initialize()
    dict_path = '/home/spaces_ac_cn/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
    tokenizer = Tokenizer(dict_path)

    def some_texts():
        for _ in range(2):  # repeat the data twice
            filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*')
            np.random.shuffle(filenames)
            for filename in filenames:
                with open(filename) as f:
                    for l in f:
                        l = json.loads(l)['text'].strip()
                        yield re.findall(u'.*?[\n。]+', l)

    def word_segment(text):
        return jieba.lcut(text)
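
The snippet shows the generator and the segmentation helper but not how they feed the dataset returned at the top. A rough sketch of that glue, assuming the pipeline encodes each sentence with the bert4keras tokenizer defined above; the generator name and batch size here are made up for illustration:

import tensorflow as tf

def some_token_ids():
    # hypothetical glue: segment each sentence, then encode it with the tokenizer above
    for texts in some_texts():
        for text in texts:
            words = word_segment(text)
            token_ids, _ = tokenizer.encode(' '.join(words))
            yield token_ids

dataset = tf.data.Dataset.from_generator(
    some_token_ids,
    output_types=tf.int32,
    output_shapes=tf.TensorShape([None]),
)
dataset = dataset.shuffle(32 * 1000)                      # shuffle, as in the function above
dataset = dataset.padded_batch(32, padded_shapes=[None])  # pad variable-length id sequences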
Example #4
if args.pos:
    import jieba_fast.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        # POS mode: the cut_all flag is ignored and each word gets its tag appended
        for w, f in jieba_fast.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    result = delim.join(cutfunc(l, cutall, hmm))
    if PY2:
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()

fp.close()
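
The snippet reads its options from an args namespace and a few globals (text_type, PY2, default_encoding) defined earlier in the CLI script. A hedged sketch of the argument parsing implied by the attributes it uses; the flag names and defaults below are assumptions, only the attribute names (delimiter, pos, dict, user_dict, cutall, hmm, filename) come from the code above:

import sys
from argparse import ArgumentParser

parser = ArgumentParser(description="command-line word segmentation with jieba_fast (sketch)")
parser.add_argument('-d', '--delimiter', default=' / ',
                    help='string used to join the cut words')
parser.add_argument('-p', '--pos', nargs='?', const='_', default=None,
                    help='also print part-of-speech tags, joined to each word by this delimiter')
parser.add_argument('-D', '--dict', default=None, help='path to the main dictionary')
parser.add_argument('-u', '--user-dict', dest='user_dict', default=None,
                    help='path to an extra user dictionary')
parser.add_argument('-a', '--cut-all', dest='cutall', action='store_true',
                    help='full-mode segmentation')
parser.add_argument('-n', '--no-hmm', dest='hmm', action='store_false',
                    help='disable the HMM for out-of-vocabulary words')
parser.add_argument('filename', nargs='?', default=None,
                    help='input file; stdin is used when omitted')
args = parser.parse_args()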