def __input_data(sentence1, sentence2, dtype="word", input_length=20):
    data_left_sentence = []
    data_right_sentence = []
    for s1, s2 in zip(sentence1, sentence2):
        if dtype == "word":
            # consecutive "*" characters in a sentence stand for a number
            star = re.compile(r"\*+")
            data_left_sentence.append([
                word2index[word]
                for word in list(jieba.cut(star.sub("1", s1)))
                if word in word2index
            ])
            data_right_sentence.append([
                word2index[word]
                for word in list(jieba.cut(star.sub("1", s2)))
                if word in word2index
            ])
        if dtype == "char":
            data_left_sentence.append(
                [char2index[char] for char in s1 if char in char2index])
            data_right_sentence.append(
                [char2index[char] for char in s2 if char in char2index])
    # pad sentences in the corpus to a uniform length
    data_left_sentence = pad_sequences(data_left_sentence, maxlen=input_length)
    data_right_sentence = pad_sequences(data_right_sentence, maxlen=input_length)
    return [data_left_sentence, data_right_sentence]
def __iter__(self):
    with open(model_dir + "atec_nlp_sim_train.csv", "r", encoding="utf8") as atec:
        for line in atec:
            lineno, s1, s2, label = line.strip().split("\t")
            yield list(jieba.cut(s1)) + list(jieba.cut(s2))
    with open("resources/wiki_corpus/wiki.csv", "r", encoding="utf8") as wiki:
        for line in wiki:
            title, doc = line.strip().split("|")
            for sentence in doc.split("#"):
                if len(sentence) > 0:
                    # keep only tokens whose first character is a CJK ideograph
                    yield [
                        word for word in list(jieba.cut(sentence))
                        if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                    ]
def __load_data(dtype="word", input_length=20, w2v_length=VECTOR_LENGTH):
    filename = os.path.join(MODEL_DIR, "%s_%d_%d" % (dtype, input_length, w2v_length))
    if os.path.exists(filename):
        return pd.read_pickle(filename)

    data_left_sentence = []
    data_right_sentence = []
    labels = []
    for line in open(ANT_NLP_FILE_PATH, "r", encoding="utf8"):
        line_number, sentence1, sentence2, label = line.strip().split("\t")
        # consecutive "*" characters in a sentence stand for a number
        star = re.compile(r"\*+")
        sentence1 = remove_punctuation(star.sub("1", sentence1))
        sentence2 = remove_punctuation(star.sub("1", sentence2))
        if dtype == "word":
            data_left_sentence.append([
                word2index[word] for word in list(jieba.cut(sentence1))
                if word in word2index
            ])
            data_right_sentence.append([
                word2index[word] for word in list(jieba.cut(sentence2))
                if word in word2index
            ])
        if dtype == "char":
            data_left_sentence.append([
                char2index[char] for char in sentence1 if char in char2index
            ])
            data_right_sentence.append([
                char2index[char] for char in sentence2 if char in char2index
            ])
        labels.append(int(label))

    logging.info('length of featured sentence is %d', len(data_left_sentence))

    # pad sentences in the corpus to a uniform length
    data_left_sentence = pad_sequences(data_left_sentence, maxlen=input_length)
    data_right_sentence = pad_sequences(data_right_sentence, maxlen=input_length)
    labels = np.array(labels)

    pd.to_pickle((data_left_sentence, data_right_sentence, labels), filename)
    return (data_left_sentence, data_right_sentence, labels)
def load_data_and_labels_multiclass(sql, stops_words):
    """
    Loads data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    engine = create_engine(
        "mysql+pymysql://***:***@***.*.*.*:3306/FlatWhite?charset=utf8",
        encoding='utf-8')

    # Load stop words
    with codecs.open(stops_words, "r", "utf-8") as file:
        stops_words = [line.strip() for line in file.readlines()]

    # Load data from files
    data = pd.read_sql(sql, con=engine)
    x_corpus = data['corpus'].tolist()

    # Map the actual labels to one-hot labels
    labels = sorted(set(data['label'].tolist()))
    one_hot = np.eye(len(labels), dtype=int)
    label_dict = dict(zip(labels, one_hot))

    x_raw = [[item for item in jieba.cut(s) if item not in stops_words]
             for s in x_corpus]
    y_raw = data['label'].map(lambda s: label_dict[s]).tolist()
    return x_raw, y_raw, data
def filter_alphabet(one_dict):
    """
    Filter out alphabetic entries, which are mostly gene sequences and the like.
    :param one_dict:
    :return:
    """
    one_pattern = r'[a-zA-Z]{3,}'
    pattern = re.compile(one_pattern)
    for key in list(one_dict):
        max_char = key.split("-")[-1]  # take the largest (last) character of the key
        if max_char.isalpha():
            # only process keys ending in a letter; garbled entries such as "??????adfadsf" are skipped
            re_res = pattern.findall(one_dict[key])
            re_res_length = len(re_res)
            seg_res = jieba.cut(one_dict[key].strip(), cut_all=True)
            seg_res_list = [i for i in list(seg_res) if i != '']
            seg_res_list_length = len(seg_res_list)
            if seg_res_list_length == 0:
                continue
            elif seg_res_list_length * 0.5 <= re_res_length:
                one_dict.pop(key)
    return one_dict
def count():  # count word frequencies
    with open(r'C:\Users\Administrator\Desktop\python计字频\斗破苍穹.txt', 'r') as f:
        text = f.read()
    words = jieba.cut(text)
    word = [x for x in words if len(x) > 1]
    counter = Counter(word)  # renamed to avoid shadowing the built-ins str and dict
    print(counter)
def eval(infile, ime):
    tot = 0
    corr = 0
    tot_sen = 0
    corr_sen = 0
    corpus = []
    with open(infile, "r", encoding="utf-8") as inf:
        for line in tqdm(inf.readlines()):
            line = punc.sub('\n', line)
            line = line.split('\n')
            for item in line:
                if len(item):
                    corpus.append(item)
    tot_sen = len(corpus)
    for item in tqdm(corpus):
        pinyin = []
        item = filtrate.sub('', item)
        charlist = jieba_fast.cut(item)
        for word in charlist:
            pinyin.extend(list(lazy_pinyin(word)))
        try:
            res = ime.predictio(' '.join(pinyin))
            dis = Levenshtein.distance(res, item)
            tot += len(item)
            corr += (len(item) - dis)
            if dis == 0:
                corr_sen += 1
        except:
            pass
    print("Prediction precision: (word)%f%%, (sentence)%f%%" %
          ((corr * 100 / tot), (corr_sen * 100 / tot_sen)))
    return corr * 100 / tot
def word_cut(self, sentences):
    if self.language == 'ch':
        func = lambda line: [i.strip() for i in jieba.cut(line, cut_all=False)]
    else:
        func = lambda line: line.split(" ")
    ## TODO: remove stop words or mark stop words
    t0 = time.time()
    word_cut = []
    for line in tqdm(sentences):
        try:
            words = func(line)
            if self.language == 'ch':
                words = [
                    i for i in words
                    if (not i.isdigit()) and (i not in self.stop_words)
                ]
            else:
                words = [
                    i for i in words
                    if (not i.isdigit()) and (i not in self.stop_words) and (len(i) > 1)
                ]
            if len(words) > 1:
                word_cut.append(words)
        except Exception as e:
            print(line)
            print(e)
            continue
    print('Single Process time {:.0f}'.format(time.time() - t0))
    return word_cut
def pre_process(df, train_mode=True):
    tokenize = lambda s: list(jieba.cut(star.sub("X", s)))
    df["words1"] = df["sent1"].apply(tokenize)
    df["words2"] = df["sent2"].apply(tokenize)
    if train_mode:
        df.to_csv(clean_path, sep="\t", index=False, encoding="utf8")
    return df
def make_segment_file(): print("seement file start") jieba.suggest_freq('沙瑞金', True) jieba.suggest_freq('田国富', True) jieba.suggest_freq('高育良', True) jieba.suggest_freq('侯亮平', True) jieba.suggest_freq('钟小艾', True) jieba.suggest_freq('陈岩石', True) jieba.suggest_freq('欧阳菁', True) jieba.suggest_freq('易学习', True) jieba.suggest_freq('王大路', True) jieba.suggest_freq('蔡成功', True) jieba.suggest_freq('孙连城', True) jieba.suggest_freq('季昌明', True) jieba.suggest_freq('丁义珍', True) jieba.suggest_freq('郑西坡', True) jieba.suggest_freq('赵东来', True) jieba.suggest_freq('高小琴', True) jieba.suggest_freq('赵瑞龙', True) jieba.suggest_freq('林华华', True) jieba.suggest_freq('陆亦可', True) jieba.suggest_freq('刘新建', True) jieba.suggest_freq('刘庆祝', True) with open("./in_the_name_of_people.txt") as f: document = f.read() d_cut = jieba.cut(document) res = " ".join(d_cut) with open("./segment_doc.txt", "w") as f: f.write(res) print("segment file ok")
def make_cut(model_file):
    with open(model_file, "r", encoding="utf-8") as f:
        doc = f.read()
    d_cut = " ".join(jieba.cut(doc))
    with open("wiki_cut.txt", "w", encoding="utf-8") as f:
        f.write(d_cut)
    return d_cut
def testcut(testD, stopword, dictionary, similarity):
    s_id, t_id = [], []
    for i in testD.index:
        text = testD.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
        text = re.sub('\([a-zA-Z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*', '', text)  # strip numbers
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '', text)  # strip punctuation
        temp = list(jieba.cut(text, HMM=True))
        word_list = temp
        # optional stop-word filtering (disabled in the original):
        # word_list = [word for word in temp if word not in stopword]
        test_corpus = dictionary.doc2bow(word_list)
        similarity.num_best = 21
        # ids (1-based) of the 21 most similar documents
        temp_id = [int(item[0]) + 1 for item in similarity[test_corpus]]
        if i not in temp_id:
            t_id.extend(temp_id[:20])
        else:
            temp_id.remove(i)
            t_id.extend(temp_id)
        s_id.extend([i] * 20)
    dfre = pd.DataFrame({'source_id': s_id, 'target_id': t_id})
    return dfre
def tcutword(data, stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
        text = re.sub('\([a-zA-Z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*', '', text)  # strip numbers
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '', text)  # strip punctuation
        temp = list(jieba.cut(text, HMM=True))
        word_list = temp
        # optional stop-word filtering (disabled in the original):
        # word_list = [word for word in temp if word not in stopword]
        corpora_documents.append(word_list)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(ttext) for ttext in corpora_documents]
    similarity = similarities.Similarity('-Similarity-index', corpus,
                                         num_features=99999999)
    return dictionary, similarity
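# A minimal sketch of how tcutword and testcut above might be wired together:
# index one set of texts, then query it with another. The CSV file names,
# separator, and column layout below are assumptions for illustration only,
# not taken from the original code.
import pandas as pd

source_df = pd.read_csv("source_texts.csv", sep="\t", index_col="id")  # hypothetical corpus to index
query_df = pd.read_csv("query_texts.csv", sep="\t", index_col="id")    # hypothetical texts to match
stopword = []  # stop-word filtering is disabled inside both functions anyway

dictionary, similarity = tcutword(source_df, stopword)
pairs = testcut(query_df, stopword, dictionary, similarity)
pairs.to_csv("similar_pairs.csv", index=False)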
def get_processed_content_from_content(event_dict):
    cat_content = event_dict['title'] + \
        ('.' if event_dict['lang'] == 'en' else '。') + \
        event_dict['content']
    stop_words = ['。', ' ', ',', '.', ',', '的', '-', '了', '新冠', '病毒', '、',
                  '研究', '和', '在', '发现', '中', '患者', '冠状病毒', '与', '肺炎',
                  '团队', '人员', '(', ')', '是', '该', '对', '为']
    return [word for word in jieba.cut(cat_content) if word not in stop_words]
def get_processed_content(brief):
    """first get content by _id, then concatenate title and content,
    then use jieba to cut the sentences, return list of words"""
    event_dict = get_content_by_id(get_id(brief))
    # parentheses limit the conditional to the separator character,
    # so title and content are always both included
    cat_content = event_dict['title'] + \
        ('.' if event_dict['lang'] == 'en' else '。') + \
        event_dict['content']
    return [word for word in jieba.cut(cat_content)]
def extract_two(self, file=None):
    fn = codecs.open(file, 'r+', encoding='utf-8')
    string_data = fn.read()
    fn.close()

    # text pre-processing
    pattern = re.compile(
        '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~。“”、:?,【】!()——↓0-9a-zA-Z\.\.\.\.\.\.]+'
    )
    # pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
    string_data = re.sub(pattern, '', string_data)  # drop characters matching the pattern
    string_data = string_data.replace('\n', '')
    string_data = string_data.replace('\u3000', '')
    string_data = string_data.replace('\r', '')
    string_data = string_data.replace(' ', '')
    logging.info(string_data)

    # word segmentation
    seg_list_exact = jieba.cut(string_data, cut_all=False)  # accurate-mode segmentation
    object_list = []
    remove_words_custom = [
        u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。',
        u' ', u'、', u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要', u'月', u'日'
    ]  # custom removal list
    remove_words = self.parse_multiple_files(
        ['中文停用词表.txt', '哈工大停用词表.txt', '四川大学机器智能实验室停用词库.txt', '百度停用词表.txt'])
    remove_words = remove_words_custom + remove_words
    for word in seg_list_exact:  # iterate over every token
        if word not in remove_words:  # keep tokens that are not in the removal list
            logging.info('\n')
            logging.info(word)
            object_list.append(word)
    logging.info(object_list)

    # word-frequency statistics
    word_counts = collections.Counter(object_list)  # count token frequencies
    word_counts_top10 = word_counts.most_common(10)  # ten most frequent tokens
    print(word_counts_top10)  # sanity check

    # word-cloud rendering
    font_path = r'C:\Windows\Fonts\simfang.ttf'
    mask = np.array(Image.open('background.jpg'))  # background mask for the word cloud
    wc = wordcloud.WordCloud(
        background_color='white',  # background color
        font_path=font_path,       # font
        mask=mask,                 # mask image
        max_words=200,             # maximum number of words shown
        max_font_size=200,         # maximum font size
        scale=80                   # scale factor; larger values give a sharper image
    )
    wc.generate_from_frequencies(word_counts)  # build the word cloud from the frequency dict
    image_colors = wordcloud.ImageColorGenerator(mask)  # derive a color scheme from the mask
    wc.recolor(color_func=image_colors)  # recolor the cloud to match the mask
    plt.figure()
    plt.imshow(wc)       # show the word cloud
    plt.axis('off')      # hide the axes
    plt.show()           # display the figure
    wc.to_file("bb.jpg")  # save the image to a file
def cut(file, outfile):
    with open(file, mode='r', encoding="utf-8") as f:
        document = f.read()
        document_cut = jieba.cut(document)
        result = ' '.join(document_cut)
    with open(outfile, mode='w', encoding="utf-8") as outF:
        outF.write(result)
    print("File has been segmented!")
def jieba_cut(self, text):
    '''
    2020-06-03: ywz_replace raised "IndexError: string index out of range"
    on the input 'İrem 艾丽', hence the try/except below.
    '''
    try:
        text = self.ywz_replace(text)
    except:
        pass
    return list(jieba.cut(text))
def clean_data(file, out_file):
    with open(file, mode="r", encoding="utf-8") as f:
        doc = f.read()
    doc_cut = jieba.cut(doc)
    res = " ".join(doc_cut)
    with open(out_file, mode='w', encoding='utf-8') as f2:
        f2.write(res)
def my_wordcloud(filename):
    # map every listed punctuation character to a space
    punct = str.maketrans({c: " " for c in "!.,:;-?※></()=,、。/[]《》"})
    plt.rcParams['font.sans-serif'] = 'PingFang TC'  # set the font

    # load stop words
    stop = [line.strip() for line in open('stopwords.txt').readlines()]
    print('number of stop words:', len(stop))

    all_segs = []
    with open(filename) as file:
        for line in file:
            line = line.translate(punct)
            segs = line.split(' ')
            for anyy in segs:
                if len(anyy.strip()) > 2:
                    all_segs.append(anyy.strip())
    print(len(all_segs))

    jieba.load_userdict('userdict.txt')
    word_appear_times = {}
    for i in all_segs:
        for anyy in list(jieba.cut(i, cut_all=True)):
            anyy = anyy.lower()
            if anyy not in stop and len(anyy.strip()) > 2:
                if anyy not in word_appear_times:
                    word_appear_times[anyy] = 1
                else:
                    word_appear_times[anyy] += 1
            else:
                continue

    word_appear_times_ordered = sorted(word_appear_times.items(),
                                       key=lambda x: x[1], reverse=True)
    top150 = word_appear_times_ordered[0:150]
    top150_word = ' '.join([x[0] for x in top150])
    print(top150_word)

    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(colormap='RdYlGn',
                   mask=cloud_mask,
                   max_words=150,
                   background_color="black",
                   scale=4,
                   font_path='/System/Library/Fonts/PingFang.ttc')
    # generate the word cloud
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')
def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    for word in wordList:
        if word not in stopword:
            outStr += word
            outStr += ' '
    lineOut = outStr.strip().encode('utf-8')
    return lineOut
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('data/stopwords.txt')
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
def make_segment_file(file_path):
    print("start seg file")
    jieba.load_userdict("./seg_dict.txt")
    with open(file_path) as f:
        document = f.read()
        d_cut = jieba.cut(document)
        res = " ".join(d_cut)
    with open("./segment_wiki.txt", "w") as f:
        f.write(res)
    print("segment file ok")
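# The segmented, space-separated file written above is the usual input format for
# training word vectors. A minimal follow-up sketch using gensim's LineSentence;
# gensim itself, the 4.x keyword vector_size (older releases use size), and all
# hyperparameter values are assumptions, not part of the original snippet.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(LineSentence("./segment_wiki.txt"),
                 vector_size=100, window=5, min_count=5, workers=4)
model.save("wiki_word2vec.model")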
def extract_keyword_from_prodname(prod_name, stopwords=stopwords_path):
    try:
        word_list = jieba.cut(prod_name, cut_all=True)
        stop_words_list = get_stopwords(stopwords)
        if stop_words_list:
            word_list = [word.strip() for word in word_list
                         if word.strip() not in stop_words_list]
        word_list = ' '.join(word_list)
    except:
        word_list = 'None'
    return word_list
def split(stop_lists, data):
    word_list = []
    seg_list = jieba.cut(data, cut_all=False)
    list_str = " ".join(seg_list)
    for word in list_str.split(" "):
        if word.strip().lower() not in stop_lists and len(word.strip()) > 1:
            word_list.append(word)
    write_file(
        "/Users/red/Desktop/temp/news/data/word/" + str(uuid.uuid4()) + ".txt",
        word_list)
def get_topK(text, topK):
    text = jieba_fast.cut(text)
    result_str = []
    for word in text:
        if word not in stopwords and word != '\t':
            result_str.append(word)
    count = Counter(result_str)
    topk_words = count.most_common(topK)
    per_data = build_info_list(topk_words)
    return per_data
def __iter__(self):
    with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
        logging.info('generating word corpus, processing file %s', ANT_NLP_FILE_PATH)
        for line in atec:
            line_code, s1, s2, label = line.strip().split("\t")
            s1 = utils.remove_punctuation(s1)
            s2 = utils.remove_punctuation(s2)
            yield list(jieba.cut(s1)) + list(jieba.cut(s2))
    for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
        logging.info('generating word corpus, processing file %s', file)
        with open(file, 'r', encoding="utf8") as wiki:
            for line in wiki:
                line = utils.remove_punctuation(line)
                if len(line) > 0:
                    # CJK unified ideographs fall in the Unicode range [0x4E00, 0x9FA5]
                    yield [
                        word for word in list(jieba.cut(line))
                        if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                    ]
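# The two __iter__ methods above are streaming corpus iterators that yield one
# token list per sentence, which is the shape gensim's Word2Vec expects. A minimal
# sketch of that wiring; the wrapper class name WordCorpus is hypothetical, and
# the gensim 4.x keyword vector_size (older releases use size) plus all
# hyperparameter values are assumptions.
from gensim.models import Word2Vec

corpus = WordCorpus()  # hypothetical class whose __iter__ is defined above
model = Word2Vec(sentences=corpus, vector_size=256, window=5, min_count=3, workers=4)
model.wv.save_word2vec_format("word_vectors.txt")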
def kmeans_spiltouttxtfile(needspiltfile, spilttype=10, stopwords=r"./stop_words_ch.txt"):
    f1 = open(needspiltfile, "r", encoding='utf-8', errors='ignore')
    middlespiltfilt = fh.get_path_file_subpath(needspiltfile) + "/" + \
        fh.get_path_file_completebasename(needspiltfile) + "temp"
    f2 = open(middlespiltfilt, 'w', encoding='utf-8', errors='ignore')
    for line in f1:
        seg_list = jieba.cut(line, cut_all=False)
        w = (" ".join(seg_list)).replace("\t\t\t", "\t")
        f2.write(w)
    f1.close()
    f2.close()

    # read the segmented lines back; titles is a list with one segmented title per element
    titles = open(middlespiltfilt, encoding='utf-8', errors='ignore').read().split('\n')

    # helper for loading the stop-word list
    def get_custom_stopwords(stop_words_file):
        with open(stop_words_file, encoding='utf-8') as f:
            stopwords = f.read()
        stopwords_list = stopwords.split('\n')
        custom_stopwords_list = [i for i in stopwords_list]
        return custom_stopwords_list

    # load the stop words
    stop_words_file = stopwords
    stopwords = get_custom_stopwords(stop_words_file)

    # build the term-count matrix: segmented titles minus stop words, in a form KMeans accepts
    from sklearn.feature_extraction.text import CountVectorizer
    count_vec = CountVectorizer(stop_words=stopwords)
    km_matrix = count_vec.fit_transform(titles)

    # run the clustering; spilttype controls the number of clusters
    from sklearn.cluster import KMeans
    num_clusters = spilttype
    km = KMeans(n_clusters=num_clusters)
    km.fit(km_matrix)
    clusters = km.labels_.tolist()  # one cluster label per title
    return clusters
def textSplit(content):
    dic = {}
    splitedText = ""
    seqlist = jieba_fast.cut(content)
    for word in seqlist:
        splitedText = splitedText + word + " "
        if word not in dic:
            dic[word] = 1
        else:
            dic[word] += 1
    wordCloud(splitedText)
    saveJson(dic, "报告全文")
    saveSheet(dic, "报告全文")
def word_seg(input_file, output_file, mode):
    if mode == 'word':
        jieba.load_userdict(dict_path)
    with open(output_file, 'w') as f, open(input_file, 'r') as fi:
        for l in fi:
            # remove all whitespace characters
            l = ''.join(l.split())
            if mode == 'char':
                f.write(' '.join(list(l)) + '\n')
            else:
                seg = jieba.cut(l, cut_all=False)
                f.write(' '.join(seg) + '\n')
import jieba_fast as jieba

c = '小明硕士毕业于中国科学院计算所,后在日本京都大学深造'
print(" ".join(jieba.cut(c, cut_all=True)))   # full mode
print(" ".join(jieba.cut(c, cut_all=False)))  # accurate mode (the default)
print(" ".join(jieba.cut_for_search(c)))      # search-engine mode
import csv
import jieba_fast

kw = '把字刻在石头上'
print(' '.join(jieba_fast.cut(kw)))
print(' '.join(jieba_fast.cut_for_search(kw)))