Example #1
def extract_keyword(items):
    '''
    Aggregate keyword weights over a batch of news items, then filter the
    result. Words from the title and the body contribute title_term_weight
    and content_term_weight respectively; those two constants, like
    cut_words, are expected at module level.
    '''
    keywords_weight = dict()
    for item in items:
        title = item['title']
        content = item['content']

        title_terms = cut_words(title)
        content_terms = cut_words(content)

        for term in title_terms:
            try:
                keywords_weight[term] += title_term_weight
            except KeyError:
                keywords_weight[term] = title_term_weight

        for term in content_terms:
            try:
                keywords_weight[term] += content_term_weight
            except KeyError:
                keywords_weight[term] = content_term_weight

    # Drop words whose weight ratio is >= 0.8 or whose weight is <= 3
    keywords_count = dict()
    total_weight = sum(keywords_weight.values())
    for keyword, weight in keywords_weight.items():
        ratio = float(weight) / float(total_weight)
        if ratio >= 0.8 or weight <= 3:
            continue

        keywords_count[keyword] = weight

    return keywords_count
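
A minimal usage sketch; the weights and the whitespace tokenizer below are hypothetical stand-ins for the module-level definitions the function expects:

title_term_weight = 3    # hypothetical weight for title words
content_term_weight = 1  # hypothetical weight for body words
cut_words = lambda text: text.split()  # stub; the real segmenter lives in utils

docs = [{'title': 'market rally', 'content': 'the market rally continued'}]
print(extract_keyword(docs))  # {'market': 4, 'rally': 4}; low-count words are dropped
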
Example #2
def freq_word_evaluation_half(items, topk=10, topk_weight=5):
    '''
    Select the topk_weight highest-weighted comments and count the topk
    most frequent words among them.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'content': news text,
            'lable': class label, 'weight': weight of the comment within
            the class}, ...]
    output:
        keywords_dict: the topk words and their counts over the selected
        comments, e.g. {word: count, ...}
        total_weight: total word count over the selected comments
    '''
    from collections import Counter
    words_list = []
    # rank the comments by weight in descending order
    weight_dict = {}
    for idx, item in enumerate(items):
        weight_dict[idx] = item['weight']

    sorted_weight = sorted(weight_dict.items(), key=lambda kv: kv[1], reverse=True)
    result_weight = sorted_weight[:topk_weight]

    half_item = []
    for r in result_weight:
        half_item.append(items[r[0]])

    for item in half_item:
        text = item['content']
        words = cut_words(text)  # cut_words is assumed at module level
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
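
A minimal sketch, with a whitespace stub standing in for the module-level cut_words:

cut_words = lambda text: text.split()  # stub for the real segmenter
comments = [
    {'_id': 1, 'content': 'rates rise again', 'lable': 0, 'weight': 0.9},
    {'_id': 2, 'content': 'rates fall', 'lable': 0, 'weight': 0.4},
    {'_id': 3, 'content': 'spam spam spam', 'lable': 0, 'weight': 0.1},
]
top_words, total = freq_word_evaluation_half(comments, topk=5, topk_weight=2)
# only the two highest-weighted comments contribute:
# top_words == {'rates': 2, 'rise': 1, 'again': 1, 'fall': 1}, total == 5
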
Example #3
def freq_word(items, topk=20):
    '''
    Count the topk most frequent words across a batch of texts.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'source_from_name':
            news source, 'title': news title, 'content': news text,
            'timestamp': timestamp, 'lable': class label}, ...]
        topk:
            how many of the most frequent words to keep, default 20
    output:
        keywords_dict: the topk words and their counts, e.g. {word: count, ...}
        total_weight: total word count over all items
    '''
    from utils import cut_words
    from collections import Counter
    words_list = []
    for item in items:
        text = item['title'] + item['content']
        words = cut_words(text)
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
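
A usage sketch, runnable only where the project's utils module (which provides cut_words) is importable:

news = [
    {'_id': 1, 'source_from_name': 'wire', 'title': 'rates rise ',
     'content': 'central bank rates rise', 'timestamp': 0, 'lable': 0},
]
top_words, total = freq_word(news, topk=3)
# e.g. ({'rates': 2, 'rise': 2, 'central': 1}, 6) if cut_words splits on whitespace
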
Example #5
def word_bag(word, inputs, gram):
    '''
    Take the words before and after each feature word (nouns, verbs, and
    adjectives) as its bag of context words.
    input:
        word: the selected feature words, as (word, count) pairs
        inputs: the filtered comment texts
        gram: take gram words before and gram words after the feature word
    output:
        counter_dict: {feature word: counts of its context words}
        feature_list: deduplicated list of all context words
    '''
    from collections import Counter
    words_bag = []      # context words of all feature words combined
    counter_dict = {}
    for w in word:
        w_bag = []      # context words of this feature word only
        for item in inputs:
            if w[0] in item['content']:
                words = cut_words(item['content'])  # cut_words is assumed at module level
                if w[0] in words:
                    index = words.index(w[0])
                    # up to gram words before the feature word
                    bag = words[max(index - gram, 0):index]
                    # up to gram words after the feature word
                    bag.extend(words[index + 1:index + 1 + gram])
                    w_bag.extend(bag)
        words_bag.extend(w_bag)
        counter = Counter(w_bag)
        counter_dict[w] = {k: v for k, v in counter.most_common()}

    # deduplicated feature list
    feature_list = list(set(words_bag))

    return counter_dict, feature_list
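
A small illustration; the whitespace stub for the module-level cut_words is hypothetical:

cut_words = lambda text: text.split()  # stub segmenter
features = [('rates', 2)]              # (word, count) pairs, e.g. output of freq_word
texts = [{'content': 'the central bank raised rates again today'}]
counters, feature_list = word_bag(features, texts, gram=3)
# counters[('rates', 2)] == {'central': 1, 'bank': 1, 'raised': 1, 'again': 1, 'today': 1}
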
Example #7
def filter_comment(inputs):
    """
    Filter the group of comments under one news item.
    Strips '@username' mentions and emoticons, keeps only comments of 3 to
    20 words that hit no marketing word, and finally keeps a comment only
    if its nouns appear among the high-frequency (news + comments) words.
    input:
        inputs: comment data, e.g. [{'_id': comment id, 'news_id': news id,
            'content': comment text, 'news_content': text of the news item}]
    output:
        the kept comments followed by the spam-labelled ones
    """
    import re

    # every record carries the same news text; keep the last one seen
    for r in inputs:
        news_content = r['news_content']

    item_reserved = []
    item_rubbish = []

    at_pattern = r'@(.+?)\s'
    emotion_pattern = r'\[(\S+?)\]'

    for item in inputs:
        rub_label = 0  # 0 means not spam
        # append a space so a trailing '@username' is also matched
        text = re.sub(at_pattern, '', item['content'] + ' ')
        text = text.strip(' ')
        text = re.sub(emotion_pattern, '', text)
        words = cut_words(text)
        if 3 <= len(words) <= 20:
            for word in words:
                if word in market_words:
                    rub_label = 1  # hit a marketing word: spam
                    item['rub_label'] = rub_label
                    item_rubbish.append(item)
                    break

            if rub_label == 0:
                item['content'] = text
                item_reserved.append(item)
        else:
            rub_label = 1
            item['rub_label'] = rub_label
            item_rubbish.append(item)

    # keep a comment only if its nouns also appear in the news text
    comment_top, comment_noun = freq_word_comment(item_reserved)  # words and counts from the comments
    news_word = freq_word_news(news_content)  # words and counts from the news text
    imp_word = word_list(comment_top, news_word)  # merged news and comment vocabulary
    results = comment_word_in_news(comment_noun, imp_word, item_reserved)

    return results + item_rubbish
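
A call-shape sketch only: besides cut_words and market_words, this function leans on four module-level helpers (freq_word_comment, freq_word_news, word_list, comment_word_in_news) whose implementations are not shown, so the snippet below illustrates the expected record shape rather than a runnable example. Note that every record is expected to carry the news text under 'news_content':

comments = [
    {'_id': 'c1', 'news_id': 'n1',
     'content': '@trader [smile] solid analysis of the rate decision',
     'news_content': 'the central bank rate decision ...'},
]
filtered = filter_comment(comments)  # kept comments first, spam-labelled ones last
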
Example #9
def process_for_cluto(inputs, cluto_input_folder=None):
    """
    Preprocess news data into CLUTO's sparse matrix input format.
    input:
        inputs: news data, e.g. [{'_id': news id, 'source_from_name': news
            source, 'title': news title, 'content': news text,
            'timestamp': timestamp}]
    output:
        path of the generated CLUTO input file
    """
    import os
    from gensim import corpora
    # cut_words, AB_PATH and CLUTO_FOLDER are assumed at module level

    # handle default
    if not cluto_input_folder:
        cluto_input_folder = os.path.join(AB_PATH, CLUTO_FOLDER)

    words_list = []  # tokenized words of every news item
    for item in inputs:
        text = item['title'] + item['content']
        words = cut_words(text)
        words_list.append(words)

    # dictionary of feature words
    dictionary = corpora.Dictionary(words_list)

    # set of feature-word ids
    feature_set = set(dictionary.keys())

    row_count = len(inputs)  # documents count
    column_count = len(feature_set)  # feature count
    nonzero_count = 0  # nonzero elements count

    # name the file after the current PID
    if not os.path.exists(cluto_input_folder):
        os.makedirs(cluto_input_folder)
    file_name = os.path.join(cluto_input_folder, '%s.txt' % os.getpid())

    with open(file_name, 'w') as fw:
        lines = []

        for words in words_list:
            bow = dictionary.doc2bow(words)
            nonzero_count += len(bow)
            # CLUTO columns are 1-based, gensim ids are 0-based
            line = ' '.join(['%s %s' % (w + 1, c) for w, c in bow]) + '\n'
            lines.append(line)

        fw.write('%s %s %s\n' % (row_count, column_count, nonzero_count))
        fw.writelines(lines)

    return file_name
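
A sketch of the call (gensim must be installed; AB_PATH and CLUTO_FOLDER only matter when no folder is passed, so an explicit folder and a stub segmenter are used here):

cut_words = lambda text: text.split()  # stub segmenter
news = [
    {'_id': 1, 'source_from_name': 'wire', 'title': 'rates ', 'content': 'rates rise', 'timestamp': 0},
    {'_id': 2, 'source_from_name': 'wire', 'title': 'banks ', 'content': 'banks fall', 'timestamp': 0},
]
path = process_for_cluto(news, cluto_input_folder='/tmp/cluto')
# first line of the file is the header '<rows> <columns> <nonzeros>', here '2 4 4'
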
Example #11
def freq_word(items):
    '''
    Compute the word frequencies of a single text.
    input:
        items:
            a comment dict, e.g. {'_id': comment id, 'news_id': news id,
            'content': comment text}
    output:
        top_word: dict mapping each word to its relative frequency,
        e.g. {word: frequency, ...}
    '''
    from collections import Counter
    words = cut_words(items['content'])  # cut_words is assumed at module level
    counter = Counter(words)
    total = sum(counter.values())  # total number of word occurrences
    top_word = {k: float(v) / float(total) for k, v in counter.most_common()}

    return top_word
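
For a single record it returns relative frequencies, e.g. (with a hypothetical whitespace stub for cut_words):

cut_words = lambda text: text.split()  # stub segmenter
comment = {'_id': 'c1', 'news_id': 'n1', 'content': 'rates rise rates'}
print(freq_word(comment))  # {'rates': 0.666..., 'rise': 0.333...}
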
Example #13
def freq_word_evaluation(items, topk=10):
    '''
    For cluster evaluation: count the topk most frequent words in one
    class of texts.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'content': news
            text, 'lable': class label}, ...]
        topk:
            how many of the most frequent words to keep, default 10
    output:
        keywords_dict: the topk words and their counts, e.g. {word: count, ...}
        total_weight: total word count over all items
    '''
    from collections import Counter
    words_list = []
    for item in items:
        text = item['content']
        words = cut_words(text)  # cut_words is assumed at module level
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
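
Usage mirrors freq_word above, but over one cluster's texts (again with a stub segmenter):

cut_words = lambda text: text.split()  # stub segmenter
cluster = [
    {'_id': 1, 'content': 'rates rise', 'lable': 2},
    {'_id': 2, 'content': 'rates fall', 'lable': 2},
]
top_words, total = freq_word_evaluation(cluster, topk=2)
# ({'rates': 2, 'rise': 1}, 4)
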
Example #16
def ad_filter(item, market_words=market_words):
    """
    Filter a comment by simple rules: strip '@username' mentions and
    emoticons, and label the comment as spam unless it has 3 to 20 words,
    none of which is a marketing word.
    input:
        item: comment data, e.g. {'_id': comment id, 'content': comment text}
    output:
        the comment with two added fields: ad_label (0 means not spam) and
        text_filter_ad (the cleaned text)
    """
    text = item['content']
    text = remove_at(text)
    text = remove_emoticon(text)
    words = cut_words(text)

    ad_label = 0  # by default a record is not spam
    if 3 <= len(words) <= 20:
        if set(words) & set(market_words):
            ad_label = 1
    else:
        ad_label = 1

    item['ad_label'] = ad_label
    item['text_filter_ad'] = text

    return item
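
A self-contained sketch with hypothetical stubs for the module-level helpers; note that market_words must already exist when ad_filter is defined, since it is used as a default argument:

import re
remove_at = lambda t: re.sub(r'@(.+?)\s', '', t + ' ').strip()  # stub
remove_emoticon = lambda t: re.sub(r'\[(\S+?)\]', '', t)        # stub
cut_words = lambda t: t.split()                                 # stub
market_words = {'promo'}  # stub; must be defined before ad_filter

comment = {'_id': 'c1', 'content': '@bot [smile] nice breakdown of the earnings call'}
out = ad_filter(comment)
print(out['ad_label'])  # 0: within 3..20 words and no marketing word hit
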