示例#1
0
def feature_about():
    """Collect the words that follow each feature hit in the demo news sheet.

    Reads every row of the first sheet of ``RAW_NEWS_DEMO_PATH``, splits the
    text found in column 2 into sentences and segments each sentence into
    words.  For every word that contains one of the feature words (the values
    of the dict returned by ``NewsUtil.get_feature()``), the matching word and
    up to five words after it in the same sentence are recorded as one list.
    All recorded lists are finally written to ``FEATURE_ABOUT_PATH`` through
    ``CommonUtil.write_csv``.

    The segmentor is released in a ``finally`` clause so that it is freed
    even when reading or segmenting a row raises.
    """
    # The matching word itself plus the five words that follow it.
    window_size = 6

    # Only the dict values are matched; build the list once, outside the loops.
    feature_words = list(NewsUtil.get_feature().values())

    # NOTE(review): this message does not mention this function's name; it is
    # kept unchanged so log output stays the same — confirm whether it should
    # be updated.
    logger.info("In Prepare Raw News...")
    raw_news_table = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH).sheet_by_index(0)
    row_count = raw_news_table.nrows

    segmentor = Segmentor()  # create the segmentor instance
    # Load the model; the second argument is the path of the external lexicon.
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)

    feature_about_list = []
    try:
        for row_index in range(row_count):
            news_content = raw_news_table.cell_value(row_index, 2)
            for sentence in SentenceSplitter.split(news_content):
                print(sentence)
                # Segment once and keep a plain list so the result can be
                # printed, enumerated and sliced.
                words = list(segmentor.segment(sentence))
                print(words)
                for start, word in enumerate(words):
                    # One window per word at most, no matter how many
                    # feature words it contains.
                    if any(feature_word in word for feature_word in feature_words):
                        about_list = words[start:start + window_size]
                        feature_about_list.append(about_list)
                        print(about_list)
    finally:
        segmentor.release()

    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
示例#2
0
def prepare_feature():
    """Fill ``featureDict`` (defined outside this function) from the feature sheet.

    Opens ``FEATURE_PATH`` through ``CommonUtil.read_excel``, takes the first
    sheet and, for every row after the first one, stores the column-1 cell
    value under the column-0 cell value as key.  An entry that already exists
    under the same key is overwritten.  A log message is emitted before and
    after loading.
    """
    logger.info("In Prepare Feature...")
    sheet = CommonUtil.read_excel(FEATURE_PATH).sheet_by_index(0)
    # Row 0 is never read — presumably a header row; confirm against the sheet.
    for row_idx in range(1, sheet.nrows):
        name, label = sheet.cell_value(row_idx, 0), sheet.cell_value(row_idx, 1)
        featureDict[name] = label
    logger.info("Prepare Feature...Done!")
示例#3
0
def get_feature():
    """Return the feature sheet as a dict.

    Opens ``FEATURE_PATH`` through ``CommonUtil.read_excel`` and reads the
    first sheet.  Every row after the first one contributes one entry: the
    column-0 cell value is the key and the column-1 cell value is the value
    (the original comment gives ``AAAA -> gold`` as an example).  When a key
    occurs more than once, the last row wins.

    Returns:
        A new dict built from the sheet rows.
    """
    sheet = CommonUtil.read_excel(FEATURE_PATH).sheet_by_index(0)
    # Row 0 is never read — presumably a header row; confirm against the sheet.
    data_rows = range(1, sheet.nrows)
    return dict(
        (sheet.cell_value(row_idx, 0), sheet.cell_value(row_idx, 1))
        for row_idx in data_rows
    )
示例#4
0
def prepare_raw_news():
    """Append one ``[index, time, content]`` record per sheet row to ``newsList``.

    Opens ``RAW_NEWS_PATH`` through ``CommonUtil.read_excel`` and walks every
    row of the first sheet, starting at row 0.  For each row:

    * column 0 is converted with ``int`` and used as the news index,
    * column 1 is passed through ``CommonUtil.get_datetime_from_cell``,
    * column 2 is taken as-is as the news content.

    The three values are appended, as one list, to ``newsList`` (defined
    outside this function).  A log message is emitted before and after
    loading.
    """
    logger.info("In Prepare Raw News...")
    sheet = CommonUtil.read_excel(RAW_NEWS_PATH).sheet_by_index(0)
    for row_idx in range(sheet.nrows):
        record = [
            int(sheet.cell_value(row_idx, 0)),
            CommonUtil.get_datetime_from_cell(sheet.cell_value(row_idx, 1)),
            sheet.cell_value(row_idx, 2),
        ]
        newsList.append(record)
    logger.info("Prepare Raw News...Done!")