Пример #1
0
def segment_barrages(barrages, cid=None, is_corpus=False):
    """Word-segment every barrage, with a per-video on-disk cache.

    Parameters:
        barrages: iterable of barrage objects exposing play_timestamp,
                  sender_id, row_id and content.
        cid: video id used to locate the cached segmentation file.
        is_corpus: when False, also persist the result and train the
                   tf-idf and lda models on it.

    Returns:
        list of BarrageSeg whose sentence_seg_list is non-empty.
    """
    # If a cached segmentation result file exists, load it directly.
    if os.path.exists(FileUtil.get_word_segment_result_file_path(cid)):
        return load_segment_barrages(cid)

    barrage_seg_list = []
    for barrage in barrages:
        # The index stored in each BarrageSeg is its position among the
        # *kept* barrages, which equals the current list length.
        barrage_seg = BarrageSeg(barrage.play_timestamp, barrage.sender_id,
                                 barrage.row_id, len(barrage_seg_list))
        sentence_seg = __segment_sentence(barrage.content)  # segmentation of one barrage
        if len(sentence_seg) <= 0:
            # Drop barrages whose words were all filtered out
            # (guards against date spam at the end of a video).
            continue
        barrage_seg.sentence_seg_list = sentence_seg
        barrage_seg_list.append(barrage_seg)
    # PEP8: boolean flags are tested with `not`, not `is False`.
    if not is_corpus:
        # Write the segmentation to a human-checkable test file.
        __save_segment_word_to_file(barrage_seg_list, cid)
        # Persist the segmentation as json for later zscore analysis.
        save_segment_barrages(barrage_seg_list, cid)
        # Use the whole video's barrages as a corpus for tf-idf training.
        corpus = DictConfig.gen_corpus_info(barrage_seg_list, cid)
        # Train the tf-idf model on the segmented barrages.
        DictConfig.gen_tfidf_model(corpus, cid)
        # Train the lda model on the segmented barrages.
        DictConfig.gen_lda_model(corpus, cid)
    return barrage_seg_list
Пример #2
0
def segment_barrages(barrages, cid=None, is_corpus=False):
    """Segment each barrage's content into words, caching the result per cid.

    Returns a list of BarrageSeg; barrages whose words are entirely
    filtered away are omitted. When is_corpus is False the result is
    also persisted and used to train the tf-idf and lda models.
    """
    # Serve from the cached segmentation file when one already exists.
    if os.path.exists(FileUtil.get_word_segment_result_file_path(cid)):
        return load_segment_barrages(cid)

    kept = []
    kept_index = 0
    for barrage in barrages:
        seg = BarrageSeg(barrage.play_timestamp, barrage.sender_id,
                         barrage.row_id, kept_index)
        tokens = __segment_sentence(barrage.content)  # one barrage's segmentation
        if len(tokens) <= 0:
            # All words filtered (e.g. end-of-video date spam) — skip it.
            continue
        seg.sentence_seg_list = tokens
        kept.append(seg)
        kept_index += 1
    if is_corpus is False:
        # Dump the segmentation to a test file for manual inspection.
        __save_segment_word_to_file(kept, cid)
        # Save as json so later zscore analysis can reload it.
        save_segment_barrages(kept, cid)
        # Build a corpus from the whole video's barrages, then train
        # the tf-idf and lda models on it.
        corpus = DictConfig.gen_corpus_info(kept, cid)
        DictConfig.gen_tfidf_model(corpus, cid)
        DictConfig.gen_lda_model(corpus, cid)
    return kept
Пример #3
0
def get_barrage_from_live_text_file(file_path):
    """Parse a live-broadcast barrage text file into Barrage objects.

    Each line is "datetime<TAB>sender_name<TAB>content". Sender names are
    mapped to unique numeric ids via a gensim Dictionary, which is saved
    to live_sender_name.dict.
    """
    # Initialize the dictionaries (stopwords, emotion lexicon, ...) that
    # the later processing steps rely on.
    DictConfig.build_dicts()

    with codecs.open(file_path, "rb", "utf-8") as input_file:
        (folder, file_name) = os.path.split(file_path)
        # Every match starts at 12:00 on the day encoded in the file name.
        start_str = file_name.split(".")[0] + " 12:00:00"
        match_start = datetime.datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S")
        sender_name_list = []
        barrages = []
        for line in input_file:
            fields = line.strip().split("\t")
            if len(fields) < 3:
                continue
            when_str, sender, text = fields[0], fields[1], fields[2]
            sent_at = datetime.datetime.strptime(when_str, "%Y-%m-%d %H:%M:%S")
            if sent_at < match_start:
                continue  # skip barrages sent before the match began
            offset_seconds = str((sent_at - match_start).total_seconds())
            sender_name_list.append([sender])
            barrages.append(Barrage(play_timestamp=offset_seconds,
                                    sender_id=sender,
                                    content=text))
        # Give every sender name a unique numeric id.
        dictionary = corpora.Dictionary(sender_name_list)
        dictionary.save("live_sender_name.dict")
        # Replace each barrage's sender name with its numeric id.
        for barrage in barrages:
            barrage.sender_id = str(dictionary.token2id[barrage.sender_id])
        return barrages
Пример #4
0
 def __init__(self, cid):
     """Load all per-video (cid) emotion-analysis resources."""
     self.cid = cid
     # Zscore.load_high_emotion_clips_from_file returns the tuple
     # (high_emotion_clips, global_zscore_threshold,
     #  left_zscore_threshold, right_zscore_threshould), where
     # high_emotion_clips is a list of
     # (left_border, right_border, left_border_seconds, right_border_seconds).
     self.high_emotion_clips, self.global_zscore_threshold, self.left_zscore_threshold, \
     self.right_zscore_threshould = Zscore.load_high_emotion_clips_from_file(cid)
     # Segmented barrage list with word replacement, stopword filtering
     # and emoticon substitution already applied.
     self.barrage_seg_list = wordseg.load_segment_barrages(cid)
     self.barrage_count = len(self.barrage_seg_list)  # total barrage count for this video
     self.emotion_dict = DictConfig.load_emotion_dict()
     self.degree_adverb_dict = DictConfig.load_degree_adverb_dict()
     # NOTE(review): despite the name, this presumably holds a set — confirm.
     self.negatives_dict = DictConfig.load_negatives_set()
     logger.debug(u"多维情感分类词典加载成功!!!!")
Пример #5
0
def is_accept_nominal(nominal):
    """Return True when *nominal* prefix-matches an accepted POS tag.

    POS tags form coarse categories refined into sub-tags (e.g. w, n vs
    wp, wn from the jieba segmenter), hence startswith instead of
    exact equality.
    """
    return any(accepted.startswith(nominal)
               for accepted in DictConfig.get_accept_nominal_set())
Пример #6
0
def is_accept_nominal(nominal):
    """Return True when *nominal* prefix-matches an accepted POS tag."""
    accepted_tags = DictConfig.get_accept_nominal_set()
    # Tags split into sub-categories (w -> wp, wn, ...), so a prefix test
    # is required rather than plain membership.
    matching = [tag for tag in accepted_tags if tag.startswith(nominal)]
    return len(matching) > 0
Пример #7
0
def is_num_or_punctuation(word, flag):
    """Return True when (word, flag) should be rejected as a number or punctuation."""
    rejected_flags = set(["w", "m"])  # set(["w", "m", "eng"])
    if flag in rejected_flags:
        # Record what got filtered out — debugging aid.
        __record_reject_word_info(word, flag)
        return True
    return word in DictConfig.get_reject_punctuation_dict()
Пример #8
0
def is_num_or_punctuation(word, flag):
    """Decide whether the token is a number/punctuation and must be rejected."""
    if flag in set(["w", "m"]):  # set(["w", "m", "eng"])
        __record_reject_word_info(word, flag)  # log rejected tokens for debugging
        return True
    if word in DictConfig.get_reject_punctuation_dict():
        return True
    return False
Пример #9
0
def judge_valid_word(word):
    """Check whether *word* carries sentiment-relevant information.

    Returns:
        (True, category) when word is an emotion word,
        (True, word) when word is a negation word or a degree adverb,
        (False, None) otherwise.
    """
    emotion_dict = DictConfig.load_emotion_dict()
    # Invert the emotion lexicon into word -> its (first) emotion category.
    # BUG FIX: the loop variable was previously also named `word`, which
    # clobbered the parameter; and the lookup below mistakenly queried
    # emotion_dict (keyed by category) instead of this inverted map.
    emotion_word_dict = {}
    for category, word_info in emotion_dict.items():
        for word_item in word_info:
            emotion_word = word_item[0]
            if emotion_word not in emotion_word_dict:
                emotion_word_dict[emotion_word] = category
    # Negation-word dictionary.
    negatives_set = DictConfig.load_negatives_set()
    # Degree-adverb dictionary.
    degree_adverb_dict = DictConfig.load_degree_adverb_dict()
    if word in emotion_word_dict:
        return True, emotion_word_dict[word]
    elif (word in negatives_set) or (word in degree_adverb_dict):
        return True, word
    else:
        return False, None
Пример #10
0
def judge_valid_word(word):
    """Check whether *word* is sentiment-relevant.

    Returns (True, category) for emotion words, (True, word) for negation
    words and degree adverbs, and (False, None) for everything else.
    """
    emotion_dict = DictConfig.load_emotion_dict()
    # Build a word -> first emotion category map from the lexicon.
    # BUG FIX: the inner loop used to rebind `word`, destroying the
    # parameter, and the membership test below wrongly consulted
    # emotion_dict (whose keys are categories) rather than this map.
    emotion_word_dict = {}
    for category, word_info in emotion_dict.items():
        for word_item in word_info:
            lexicon_word = word_item[0]
            if lexicon_word not in emotion_word_dict:
                emotion_word_dict[lexicon_word] = category
    negatives_set = DictConfig.load_negatives_set()          # negation words
    degree_adverb_dict = DictConfig.load_degree_adverb_dict()  # degree adverbs
    if word in emotion_word_dict:
        return True, emotion_word_dict[word]
    elif (word in negatives_set) or (word in degree_adverb_dict):
        return True, word
    else:
        return False, None
Пример #11
0
def get_barrage_from_txt_file(txt_file_path, order_flag=False):
    """Read barrages from a tab-separated text file and return them sorted."""
    # Prepare the dictionaries (stopwords, emotion lexicon, ...) needed by
    # the later processing steps.
    DictConfig.build_dicts()

    barrages = []
    with codecs.open(txt_file_path, "rb", "utf-8") as input_file:
        for line in input_file:
            # Column layout: play_timestamp type font_size font_color
            # unix_timestamp pool sender_id row_id content
            columns = line.strip().split(u"\t")
            if len(columns) < 9:
                # Some records lack the content column — drop them.
                continue
            barrages.append(Barrage(columns[0], columns[1], columns[2],
                                    columns[3], columns[4], columns[5],
                                    columns[6], columns[7], columns[8]))
    barrages = sort_barrages(barrages, order_flag)
    # barrages = sorted(barrages, key=lambda barrage: barrage.play_timestamp)
    # Record every sender_id for the later similarity-matrix construction.
    BarrageInfo.collect_barrage_sender_id(barrages)
    return barrages
Пример #12
0
def format_word(word, flag):
    """Apply the first matching replacement pattern to (word, flag).

    Returns (True, replacement_word, replacement_flag) on a match, or
    (False, word, flag) when no pattern applies.
    """
    for pattern_info in DictConfig.get_replace_words_list():
        compiled = re.compile(pattern_info[0])
        if compiled.match(word) is not None:
            return True, pattern_info[1], pattern_info[2]
    return False, word, flag
Пример #13
0
def format_word(word, flag):
    """Rewrite (word, flag) via the replacement-pattern table, if any entry matches."""
    for entry in DictConfig.get_replace_words_list():
        # entry: (regex pattern, replacement word, replacement flag)
        replacement_word = entry[1]
        replacement_flag = entry[2]
        if re.compile(entry[0]).match(word) is not None:
            return True, replacement_word, replacement_flag
    return False, word, flag
Пример #14
0
def get_barrage_from_txt_file(txt_file_path, order_flag=False):
    """Load barrages from a tab-separated file, sort them, and collect sender ids."""
    # Initialize stopword / emotion dictionaries used by later steps.
    DictConfig.build_dicts()

    barrages = []
    with codecs.open(txt_file_path, "rb", "utf-8") as source:
        for raw_line in source:
            # Fields: play_timestamp type font_size font_color
            # unix_timestamp pool sender_id row_id content
            fields = raw_line.strip().split(u"\t")
            if len(fields) < 9:
                continue  # record missing the content column — skip
            record = Barrage(fields[0], fields[1], fields[2], fields[3],
                             fields[4], fields[5], fields[6], fields[7],
                             fields[8])
            barrages.append(record)
    barrages = sort_barrages(barrages, order_flag)
    # barrages = sorted(barrages, key=lambda barrage: barrage.play_timestamp)
    # Store all sender_id values for building the similarity matrix later.
    BarrageInfo.collect_barrage_sender_id(barrages)
    return barrages
Пример #15
0
def get_barrage_from_live_text_file(file_path):
    """Read live-broadcast barrages and number senders via a gensim Dictionary.

    Lines are "datetime<TAB>sender_name<TAB>content"; barrages sent before
    the 12:00 match start (derived from the file name) are dropped, and
    play_timestamp becomes seconds elapsed since the match start.
    """
    # Build the stopword / emotion dictionaries needed further on.
    DictConfig.build_dicts()

    with codecs.open(file_path, "rb", "utf-8") as source:
        file_name = os.path.split(file_path)[1]
        # Matches start at noon on the date encoded in the file name.
        start = datetime.datetime.strptime(
            file_name.split(".")[0] + " 12:00:00", "%Y-%m-%d %H:%M:%S")
        name_documents = []
        barrages = []
        for raw_line in source:
            parts = raw_line.strip().split("\t")
            if len(parts) < 3:
                continue
            stamp = datetime.datetime.strptime(parts[0], "%Y-%m-%d %H:%M:%S")
            if stamp < start:
                # The match had not started yet — ignore this barrage.
                continue
            elapsed = str((stamp - start).total_seconds())
            name_documents.append([parts[1]])
            barrages.append(Barrage(play_timestamp=elapsed,
                                    sender_id=parts[1],
                                    content=parts[2]))
        # Map each sender name to a unique numeric id and persist the map.
        dictionary = corpora.Dictionary(name_documents)
        dictionary.save("live_sender_name.dict")
        for barrage in barrages:
            barrage.sender_id = str(dictionary.token2id[barrage.sender_id])
        return barrages
Пример #16
0
def save_segment_barrages(barrage_seg_list, cid):
    """Serialize the segmentation result to the per-cid json file."""
    target_path = FileUtil.get_word_segment_result_file_path(cid)
    serialized = json.dumps(barrage_seg_list, default=lambda obj: obj.__dict__)
    with codecs.open(target_path, "wb", "utf-8") as output_file:
        output_file.write(serialized)


def load_segment_barrages(cid):
    """Read the cached segmentation json for *cid* and rebuild BarrageSeg objects."""
    file_path = FileUtil.get_word_segment_result_file_path(cid)
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        json_str = u"".join(input_file)
    barrage_seg_list_json = json.loads(json_str)
    return BarrageSeg.dict2barrageseglist(barrage_seg_list_json)


# Ad-hoc smoke test: segment a few tricky sentences (repeated punctuation,
# pure laughter, kaomoji emoticons, digits) and print each token with its
# POS flag and character positions. Python 2 print statement.
if __name__ == "__main__":
    DictConfig.build_dicts()
    sentence_list = [u"你终于承认完全不懂了!!!!!!!!!!", u"哈哈哈哈哈哈哈哈哈", u"(´▽`)ノ♪(´▽`)ノ♪(´▽`)ノ♪(´▽`)ノ♪", u"(╬゚д゚)▄︻┻┳═一(╬゚д゚)▄︻",
                     u"(╬゚д゚)▄︻┻┳═一呀(╬゚д゚)▄︻",
                     u"哈(╬゚д゚)▄︻┻┳═一不(╬゚д゚)▄︻", u"你是不是傻(╬゚д゚)▄︻┻┳═一(╬゚д゚)▄︻",
                     u"123"]
    for sentence in sentence_list:
        sentence_seg = __segment_sentence(sentence)
        for word_seg in sentence_seg:
            print word_seg.word, u"\t", word_seg.flag, u"\t", word_seg.start_position, u"\t", word_seg.end_position
Пример #17
0
    save_file_path = FileUtil.get_word_segment_result_file_path(cid)
    json_str = json.dumps(barrage_seg_list, default=lambda obj: obj.__dict__)
    with codecs.open(save_file_path, "wb", "utf-8") as output_file:
        output_file.write(json_str)


def load_segment_barrages(cid):
    """Load the segmentation result (stored as one json document) for *cid*."""
    file_path = FileUtil.get_word_segment_result_file_path(cid)
    lines = []
    with codecs.open(file_path, "rb", "utf-8") as input_file:
        for chunk in input_file:
            lines.append(chunk)
    parsed = json.loads(u"".join(lines))
    barrage_seg_list = BarrageSeg.dict2barrageseglist(parsed)
    return barrage_seg_list


# Manual smoke test for the segmenter: run it over sentences covering
# repeated punctuation, laughter, kaomoji emoticons (single and chained)
# and bare digits, printing token, POS flag and positions (Python 2 print).
if __name__ == "__main__":
    DictConfig.build_dicts()
    sentence_list = [
        u"你终于承认完全不懂了!!!!!!!!!!", u"哈哈哈哈哈哈哈哈哈", u"(´▽`)ノ♪(´▽`)ノ♪(´▽`)ノ♪(´▽`)ノ♪",
        u"(╬゚д゚)▄︻┻┳═一(╬゚д゚)▄︻", u"(╬゚д゚)▄︻┻┳═一呀(╬゚д゚)▄︻",
        u"哈(╬゚д゚)▄︻┻┳═一不(╬゚д゚)▄︻", u"你是不是傻(╬゚д゚)▄︻┻┳═一(╬゚д゚)▄︻", u"123"
    ]
    for sentence in sentence_list:
        sentence_seg = __segment_sentence(sentence)
        for word_seg in sentence_seg:
            print word_seg.word, u"\t", word_seg.flag, u"\t", word_seg.start_position, u"\t", word_seg.end_position
Пример #18
0
def replace_emoji_to_word(word, flag, word_start_position, word_end_position):
    emoji_replace_dict = DictConfig.get_emoji_replace_dict()
    emoji_set = emoji_replace_dict.keys()
    result_emoji = []  # 因为有些人两个颜文字会连发,所以需要辨别其中的多个表情。
    first_match_flag = False
    emoji_start_position = word_start_position - 1
    emoji_end_position = word_start_position - 1
    # 从word 头部开始判断emoji表情是否存在
    while word != "":
        find_flag = False
        for emoji in emoji_set:
            if word == emoji:
                emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                emoji_end_position = emoji_start_position - 1 + len(emoji)  # 当前表情在原句中的结束位置
                if len(result_emoji) <= 0:
                    if len(word) == 1:
                        return True, [(emoji_replace_dict[emoji], "emoji", emoji_start_position, emoji_end_position)]
                    return True, [(emoji_replace_dict[emoji], flag, emoji_start_position, emoji_end_position)]
                else:
                    if len(word) == 1:
                        result_emoji.append((emoji_replace_dict[emoji], "emoji",
                                             emoji_start_position, emoji_end_position))
                    result_emoji.append((emoji_replace_dict[emoji], flag, emoji_start_position, emoji_end_position))
                    return True, result_emoji
            # 多个颜文字一起发的情况,解决判断多个重复颜文字问题。
            find_flag = word.startswith(emoji)
            if find_flag:
                emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                emoji_end_position = emoji_start_position - 1 + len(emoji)  # 当前表情在原句中的结束位置
                result_emoji.append((emoji_replace_dict[emoji], "emoji", emoji_start_position, emoji_end_position))
                word = word.replace(emoji, "", 1)
                if not first_match_flag:
                    first_match_flag = True
                break
        if not find_flag:  # 没有找到对应的表情
            break

    # 从头开始匹配失败,那么从尾开始匹配试一试。
    if not first_match_flag:
        while word != "":
            find_flag = False
            for emoji in emoji_set:
                # 多个颜文字一起发的情况,解决判断多个重复颜文字问题。
                find_flag = word.endswith(emoji)
                if find_flag:
                    emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                    emoji_end_position = emoji_start_position - 1 + len(emoji)  # 当前表情在原句中的结束位置
                    result_emoji.append((emoji_replace_dict[emoji], "emoji", emoji_start_position, emoji_end_position))
                    word = word.replace(emoji, "", 1)
                    break
            if not find_flag:  # 没有找到对应的表情
                break

    # if word != "":
    #     result_emoji.append(word, "emoji-unknow")  # 没有被收录到词典中的表情
    if len(result_emoji) <= 0:
        if flag == "emoji":  # 没有识别出来的颜文字。。。或者符号。。。舍弃掉。。。
            __record_reject_word_info(word, flag)  # 调试用,看看都舍弃了一些什么符号。
            return False, None
        else:
            return False, [(word, flag)]
    else:
        return True, result_emoji
Пример #19
0
def is_stopwords(word):
    stopwords_set = DictConfig.get_stopwords_set()
    if word in stopwords_set:
        return True
    else:
        return False
Пример #20
0
def is_stopwords(word):
    stopwords_set = DictConfig.get_stopwords_set()
    if word in stopwords_set:
        return True
    else:
        return False
Пример #21
0
def replace_emoji_to_word(word, flag, word_start_position, word_end_position):
    emoji_replace_dict = DictConfig.get_emoji_replace_dict()
    emoji_set = emoji_replace_dict.keys()
    result_emoji = []  # 因为有些人两个颜文字会连发,所以需要辨别其中的多个表情。
    first_match_flag = False
    emoji_start_position = word_start_position - 1
    emoji_end_position = word_start_position - 1
    # 从word 头部开始判断emoji表情是否存在
    while word != "":
        find_flag = False
        for emoji in emoji_set:
            if word == emoji:
                emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                emoji_end_position = emoji_start_position - 1 + len(
                    emoji)  # 当前表情在原句中的结束位置
                if len(result_emoji) <= 0:
                    if len(word) == 1:
                        return True, [
                            (emoji_replace_dict[emoji], "emoji",
                             emoji_start_position, emoji_end_position)
                        ]
                    return True, [(emoji_replace_dict[emoji], flag,
                                   emoji_start_position, emoji_end_position)]
                else:
                    if len(word) == 1:
                        result_emoji.append(
                            (emoji_replace_dict[emoji], "emoji",
                             emoji_start_position, emoji_end_position))
                    result_emoji.append(
                        (emoji_replace_dict[emoji], flag, emoji_start_position,
                         emoji_end_position))
                    return True, result_emoji
            # 多个颜文字一起发的情况,解决判断多个重复颜文字问题。
            find_flag = word.startswith(emoji)
            if find_flag:
                emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                emoji_end_position = emoji_start_position - 1 + len(
                    emoji)  # 当前表情在原句中的结束位置
                result_emoji.append((emoji_replace_dict[emoji], "emoji",
                                     emoji_start_position, emoji_end_position))
                word = word.replace(emoji, "", 1)
                if not first_match_flag:
                    first_match_flag = True
                break
        if not find_flag:  # 没有找到对应的表情
            break

    # 从头开始匹配失败,那么从尾开始匹配试一试。
    if not first_match_flag:
        while word != "":
            find_flag = False
            for emoji in emoji_set:
                # 多个颜文字一起发的情况,解决判断多个重复颜文字问题。
                find_flag = word.endswith(emoji)
                if find_flag:
                    emoji_start_position = emoji_end_position + 1  # 当前表情在原句中的起始位置
                    emoji_end_position = emoji_start_position - 1 + len(
                        emoji)  # 当前表情在原句中的结束位置
                    result_emoji.append(
                        (emoji_replace_dict[emoji], "emoji",
                         emoji_start_position, emoji_end_position))
                    word = word.replace(emoji, "", 1)
                    break
            if not find_flag:  # 没有找到对应的表情
                break

    # if word != "":
    #     result_emoji.append(word, "emoji-unknow")  # 没有被收录到词典中的表情
    if len(result_emoji) <= 0:
        if flag == "emoji":  # 没有识别出来的颜文字。。。或者符号。。。舍弃掉。。。
            __record_reject_word_info(word, flag)  # 调试用,看看都舍弃了一些什么符号。
            return False, None
        else:
            return False, [(word, flag)]
    else:
        return True, result_emoji