Example #1
def extract_keyword(items):
    '''
    Aggregate keyword weights over a batch of news items, then filter the
    result. Words from the title and the body contribute title_term_weight
    and content_term_weight respectively; those two constants, like
    cut_words, are expected at module level.
    '''
    keywords_weight = dict()
    for item in items:
        title = item['title']
        content = item['content']

        title_terms = cut_words(title)
        content_terms = cut_words(content)

        for term in title_terms:
            try:
                keywords_weight[term] += title_term_weight
            except KeyError:
                keywords_weight[term] = title_term_weight

        for term in content_terms:
            try:
                keywords_weight[term] += content_term_weight
            except KeyError:
                keywords_weight[term] = content_term_weight

    # Drop words whose weight ratio is >= 0.8 or whose weight is <= 3
    keywords_count = dict()
    total_weight = sum(keywords_weight.values())
    for keyword, weight in keywords_weight.items():
        ratio = float(weight) / float(total_weight)
        if ratio >= 0.8 or weight <= 3:
            continue

        keywords_count[keyword] = weight

    return keywords_count
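
A minimal usage sketch; the weights and the whitespace tokenizer below are hypothetical stand-ins for the module-level definitions the function expects:

title_term_weight = 3    # hypothetical weight for title words
content_term_weight = 1  # hypothetical weight for body words
cut_words = lambda text: text.split()  # stub; the real segmenter lives in utils

docs = [{'title': 'market rally', 'content': 'the market rally continued'}]
print(extract_keyword(docs))  # {'market': 4, 'rally': 4}; low-count words are dropped
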
Example #2
def freq_word_evaluation_half(items, topk=10, topk_weight=5):
    '''
    Select the topk_weight highest-weighted comments and count the topk
    most frequent words among them.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'content': news text,
            'lable': class label, 'weight': weight of the comment within
            the class}, ...]
    output:
        keywords_dict: the topk words and their counts over the selected
        comments, e.g. {word: count, ...}
        total_weight: total word count over the selected comments
    '''
    from collections import Counter
    words_list = []
    # rank the comments by weight in descending order
    weight_dict = {}
    for idx, item in enumerate(items):
        weight_dict[idx] = item['weight']

    sorted_weight = sorted(weight_dict.items(), key=lambda kv: kv[1], reverse=True)
    result_weight = sorted_weight[:topk_weight]

    half_item = []
    for r in result_weight:
        half_item.append(items[r[0]])

    for item in half_item:
        text = item['content']
        words = cut_words(text)  # cut_words is assumed at module level
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
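
A minimal sketch, with a whitespace stub standing in for the module-level cut_words:

cut_words = lambda text: text.split()  # stub for the real segmenter
comments = [
    {'_id': 1, 'content': 'rates rise again', 'lable': 0, 'weight': 0.9},
    {'_id': 2, 'content': 'rates fall', 'lable': 0, 'weight': 0.4},
    {'_id': 3, 'content': 'spam spam spam', 'lable': 0, 'weight': 0.1},
]
top_words, total = freq_word_evaluation_half(comments, topk=5, topk_weight=2)
# only the two highest-weighted comments contribute:
# top_words == {'rates': 2, 'rise': 1, 'again': 1, 'fall': 1}, total == 5
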
Example #3
def freq_word(items, topk=20):
    '''
    Count the topk most frequent words across a batch of texts.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'source_from_name':
            news source, 'title': news title, 'content': news text,
            'timestamp': timestamp, 'lable': class label}, ...]
        topk:
            how many of the most frequent words to keep, default 20
    output:
        keywords_dict: the topk words and their counts, e.g. {word: count, ...}
        total_weight: total word count over all items
    '''
    from utils import cut_words
    from collections import Counter
    words_list = []
    for item in items:
        text = item['title'] + item['content']
        words = cut_words(text)
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
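
A usage sketch, runnable only where the project's utils module (which provides cut_words) is importable:

news = [
    {'_id': 1, 'source_from_name': 'wire', 'title': 'rates rise ',
     'content': 'central bank rates rise', 'timestamp': 0, 'lable': 0},
]
top_words, total = freq_word(news, topk=3)
# e.g. ({'rates': 2, 'rise': 2, 'central': 1}, 6) if cut_words splits on whitespace
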
Example #5
def word_bag(word, inputs, gram):
    '''
    Take the words before and after each feature word (nouns, verbs, and
    adjectives) as its bag of context words.
    input:
        word: the selected feature words, as (word, count) pairs
        inputs: the filtered comment texts
        gram: take gram words before and gram words after the feature word
    output:
        counter_dict: {feature word: counts of its context words}
        feature_list: deduplicated list of all context words
    '''
    from collections import Counter
    words_bag = []      # context words of all feature words combined
    counter_dict = {}
    for w in word:
        w_bag = []      # context words of this feature word only
        for item in inputs:
            if w[0] in item['content']:
                words = cut_words(item['content'])  # cut_words is assumed at module level
                if w[0] in words:
                    index = words.index(w[0])
                    # up to gram words before the feature word
                    bag = words[max(index - gram, 0):index]
                    # up to gram words after the feature word
                    bag.extend(words[index + 1:index + 1 + gram])
                    w_bag.extend(bag)
        words_bag.extend(w_bag)
        counter = Counter(w_bag)
        counter_dict[w] = {k: v for k, v in counter.most_common()}

    # deduplicated feature list
    feature_list = list(set(words_bag))

    return counter_dict, feature_list
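
A small illustration; the whitespace stub for the module-level cut_words is hypothetical:

cut_words = lambda text: text.split()  # stub segmenter
features = [('rates', 2)]              # (word, count) pairs, e.g. output of freq_word
texts = [{'content': 'the central bank raised rates again today'}]
counters, feature_list = word_bag(features, texts, gram=3)
# counters[('rates', 2)] == {'central': 1, 'bank': 1, 'raised': 1, 'again': 1, 'today': 1}
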
Example #7
def filter_comment(inputs):
    """
    Filter the group of comments under one news item.
    Strips '@username' mentions and emoticons, keeps only comments of 3 to
    20 words that hit no marketing word, and finally keeps a comment only
    if its nouns appear among the high-frequency (news + comments) words.
    input:
        inputs: comment data, e.g. [{'_id': comment id, 'news_id': news id,
            'content': comment text, 'news_content': text of the news item}]
    output:
        the kept comments followed by the spam-labelled ones
    """
    import re

    # every record carries the same news text; keep the last one seen
    for r in inputs:
        news_content = r['news_content']

    item_reserved = []
    item_rubbish = []

    at_pattern = r'@(.+?)\s'
    emotion_pattern = r'\[(\S+?)\]'

    for item in inputs:
        rub_label = 0  # 0 means not spam
        # append a space so a trailing '@username' is also matched
        text = re.sub(at_pattern, '', item['content'] + ' ')
        text = text.strip(' ')
        text = re.sub(emotion_pattern, '', text)
        words = cut_words(text)
        if 3 <= len(words) <= 20:
            for word in words:
                if word in market_words:
                    rub_label = 1  # hit a marketing word: spam
                    item['rub_label'] = rub_label
                    item_rubbish.append(item)
                    break

            if rub_label == 0:
                item['content'] = text
                item_reserved.append(item)
        else:
            rub_label = 1
            item['rub_label'] = rub_label
            item_rubbish.append(item)

    # keep a comment only if its nouns also appear in the news text
    comment_top, comment_noun = freq_word_comment(item_reserved)  # words and counts from the comments
    news_word = freq_word_news(news_content)  # words and counts from the news text
    imp_word = word_list(comment_top, news_word)  # merged news and comment vocabulary
    results = comment_word_in_news(comment_noun, imp_word, item_reserved)

    return results + item_rubbish
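
A call-shape sketch only: besides cut_words and market_words, this function leans on four module-level helpers (freq_word_comment, freq_word_news, word_list, comment_word_in_news) whose implementations are not shown, so the snippet below illustrates the expected record shape rather than a runnable example. Note that every record is expected to carry the news text under 'news_content':

comments = [
    {'_id': 'c1', 'news_id': 'n1',
     'content': '@trader [smile] solid analysis of the rate decision',
     'news_content': 'the central bank rate decision ...'},
]
filtered = filter_comment(comments)  # kept comments first, spam-labelled ones last
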
Example #9
def process_for_cluto(inputs, cluto_input_folder=None):
    """
    Preprocess news data into CLUTO's sparse matrix input format.
    input:
        inputs: news data, e.g. [{'_id': news id, 'source_from_name': news
            source, 'title': news title, 'content': news text,
            'timestamp': timestamp}]
    output:
        path of the generated CLUTO input file
    """
    import os
    from gensim import corpora
    # cut_words, AB_PATH and CLUTO_FOLDER are assumed at module level

    # handle default
    if not cluto_input_folder:
        cluto_input_folder = os.path.join(AB_PATH, CLUTO_FOLDER)

    words_list = []  # tokenized words of every news item
    for item in inputs:
        text = item['title'] + item['content']
        words = cut_words(text)
        words_list.append(words)

    # dictionary of feature words
    dictionary = corpora.Dictionary(words_list)

    # set of feature-word ids
    feature_set = set(dictionary.keys())

    row_count = len(inputs)  # documents count
    column_count = len(feature_set)  # feature count
    nonzero_count = 0  # nonzero elements count

    # name the file after the current PID
    if not os.path.exists(cluto_input_folder):
        os.makedirs(cluto_input_folder)
    file_name = os.path.join(cluto_input_folder, '%s.txt' % os.getpid())

    with open(file_name, 'w') as fw:
        lines = []

        for words in words_list:
            bow = dictionary.doc2bow(words)
            nonzero_count += len(bow)
            # CLUTO columns are 1-based, gensim ids are 0-based
            line = ' '.join(['%s %s' % (w + 1, c) for w, c in bow]) + '\n'
            lines.append(line)

        fw.write('%s %s %s\n' % (row_count, column_count, nonzero_count))
        fw.writelines(lines)

    return file_name
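
A sketch of the call (gensim must be installed; AB_PATH and CLUTO_FOLDER only matter when no folder is passed, so an explicit folder and a stub segmenter are used here):

cut_words = lambda text: text.split()  # stub segmenter
news = [
    {'_id': 1, 'source_from_name': 'wire', 'title': 'rates ', 'content': 'rates rise', 'timestamp': 0},
    {'_id': 2, 'source_from_name': 'wire', 'title': 'banks ', 'content': 'banks fall', 'timestamp': 0},
]
path = process_for_cluto(news, cluto_input_folder='/tmp/cluto')
# first line of the file is the header '<rows> <columns> <nonzeros>', here '2 4 4'
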
Example #11
def freq_word(items):
    '''
    Compute the word frequencies of a single text.
    input:
        items:
            a comment dict, e.g. {'_id': comment id, 'news_id': news id,
            'content': comment text}
    output:
        top_word: dict mapping each word to its relative frequency,
        e.g. {word: frequency, ...}
    '''
    from collections import Counter
    words = cut_words(items['content'])  # cut_words is assumed at module level
    counter = Counter(words)
    total = sum(counter.values())  # total number of word occurrences
    top_word = {k: float(v) / float(total) for k, v in counter.most_common()}

    return top_word
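
For a single record it returns relative frequencies, e.g. (with a hypothetical whitespace stub for cut_words):

cut_words = lambda text: text.split()  # stub segmenter
comment = {'_id': 'c1', 'news_id': 'n1', 'content': 'rates rise rates'}
print(freq_word(comment))  # {'rates': 0.666..., 'rise': 0.333...}
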
Example #13
def freq_word_evaluation(items, topk=10):
    '''
    For cluster evaluation: count the topk most frequent words in one
    class of texts.
    input:
        items:
            list of news dicts, e.g. [{'_id': news id, 'content': news
            text, 'lable': class label}, ...]
        topk:
            how many of the most frequent words to keep, default 10
    output:
        keywords_dict: the topk words and their counts, e.g. {word: count, ...}
        total_weight: total word count over all items
    '''
    from collections import Counter
    words_list = []
    for item in items:
        text = item['content']
        words = cut_words(text)  # cut_words is assumed at module level
        words_list.extend(words)

    counter = Counter(words_list)
    total_weight = sum(counter.values())
    topk_words = counter.most_common(topk)
    keywords_dict = {k: v for k, v in topk_words}

    return keywords_dict, total_weight
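
Usage mirrors freq_word above, but over one cluster's texts (again with a stub segmenter):

cut_words = lambda text: text.split()  # stub segmenter
cluster = [
    {'_id': 1, 'content': 'rates rise', 'lable': 2},
    {'_id': 2, 'content': 'rates fall', 'lable': 2},
]
top_words, total = freq_word_evaluation(cluster, topk=2)
# ({'rates': 2, 'rise': 1}, 4)
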
Example #16
def ad_filter(item, market_words=market_words):
    """
    Filter a comment by simple rules: strip '@username' mentions and
    emoticons, and label the comment as spam unless it has 3 to 20 words,
    none of which is a marketing word.
    input:
        item: comment data, e.g. {'_id': comment id, 'content': comment text}
    output:
        the comment with two added fields: ad_label (0 means not spam) and
        text_filter_ad (the cleaned text)
    """
    text = item['content']
    text = remove_at(text)
    text = remove_emoticon(text)
    words = cut_words(text)

    ad_label = 0  # by default a record is not spam
    if 3 <= len(words) <= 20:
        if set(words) & set(market_words):
            ad_label = 1
    else:
        ad_label = 1

    item['ad_label'] = ad_label
    item['text_filter_ad'] = text

    return item
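
A self-contained sketch with hypothetical stubs for the module-level helpers; note that market_words must already exist when ad_filter is defined, since it is used as a default argument:

import re
remove_at = lambda t: re.sub(r'@(.+?)\s', '', t + ' ').strip()  # stub
remove_emoticon = lambda t: re.sub(r'\[(\S+?)\]', '', t)        # stub
cut_words = lambda t: t.split()                                 # stub
market_words = {'promo'}  # stub; must be defined before ad_filter

comment = {'_id': 'c1', 'content': '@bot [smile] nice breakdown of the earnings call'}
out = ad_filter(comment)
print(out['ad_label'])  # 0: within 3..20 words and no marketing word hit
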