"""训练数据与测试数据随机排序,去掉停用词
"""
import codecs
import os
import sys
from tqdm import tqdm

sys.path.insert(0, './')  # 定义搜索路径的优先顺序,序号从0开始,表示最大优先级

import myClue  # noqa
print('myClue module path :{}'.format(myClue.__file__))  # 输出测试模块文件位置
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa

# Stop-word vocabulary, read once when the module is loaded.
stopwords = set()
stopwords.update(read_file_texts('./data/stopwords/stopwords_mix.txt'))
# Report the vocabulary size together with up to 20 sample entries.
logger.info(
    'stopwords len:{}, example:{}'.format(len(stopwords), list(stopwords)[:20])
)


def news_content_process(news_content, stop_words=None):
    """Drop stop words from a space-separated text.

    The text is split on single spaces, so consecutive spaces produce empty
    tokens; those are kept unless the empty string is itself a stop word.

    Args:
        news_content: Text whose tokens are separated by single spaces.
        stop_words: Optional container of tokens to remove. When omitted,
            the module-level ``stopwords`` set is used.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    if stop_words is None:
        # Default to the vocabulary loaded at module import time.
        stop_words = stopwords
    kept = [word for word in news_content.split(' ') if word not in stop_words]
    return ' '.join(kept)


if __name__ == "__main__":
    train_file_config = {
        'train': './data/UCAS_NLP_TC/data_01_shuffle/traindata.txt',
        'dev': './data/UCAS_NLP_TC/data_01_shuffle/devdata.txt',
예제 #2
0
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa


# Script entry point: walks the train/dev/test word-segmentation JSON files and
# opens one '<split>data.txt' output file per split under output_path.
# NOTE(review): this excerpt is cut off inside the innermost loop; the code that
# fills `words` / `item_filter` and writes to `fw` is not visible here.
if __name__ == "__main__":
    # Input file for each split; every line is parsed with json.loads below.
    train_file_config = {
        'train': './data/UCAS_NLP_TC/data_baidu_cws/train_cws.json',
        'dev': './data/UCAS_NLP_TC/data_baidu_cws/dev_cws.json',
        'test': './data/UCAS_NLP_TC/data_baidu_cws/test_cws.json',
    }
    output_path = './data/UCAS_NLP_TC/data_11_baidu_nerwords'
    # presumably prepares/creates the output directory -- confirm in myClue.tools.file
    init_file_path(output_path)
    for file_label, file_name in train_file_config.items():
        logger.info('开始处理:{}'.format(file_label))
        texts = read_file_texts(file_name)
        # Output file is named after the split, e.g. 'traindata.txt'.
        output_file_name = os.path.join(output_path, '{}data.txt'.format(file_label))
        with codecs.open(output_file_name, mode='w', encoding='utf8') as fw:
            for text in tqdm(texts):
                # Each record must provide the 'label' and 'cws_items' keys.
                row_data = json.loads(text)
                label = row_data['label']
                cws_items = row_data['cws_items']
                # Per-record accumulators, reset for every input line.
                words = list()
                item_filter = set()
                original_words = list()
                for cws_item in cws_items:
                    item_text = cws_item['item']
                    # Every item contributes its basic words, whatever its 'ne' tag.
                    original_words.extend(cws_item['basic_words'])
                    # Only items whose 'ne' tag is in this set are considered further.
                    if cws_item['ne'] in {'ORG', 'PER', 'LOC', 'nr', 'ns', 'nt', 'nw', 'nz'}:
                        # Skip item texts already present in item_filter
                        # (looks like per-record de-duplication -- confirm
                        # against the full script).
                        if item_text in item_filter:
                            continue
예제 #3
0
import myClue  # noqa
print('myClue module path :{}'.format(myClue.__file__))  # 输出测试模块文件位置
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa

# Script entry point: merges the translated train and dev splits, shuffles them
# and writes one '__label__<label>\t<english text>' line per non-empty record.
if __name__ == "__main__":
    train_file_config = {
        'train': './data/UCAS_NLP_TC/data_baidu_trans/train_trans.json',
        'dev': './data/UCAS_NLP_TC/data_baidu_trans/dev_trans.json',
        'test': './data/UCAS_NLP_TC/data_baidu_trans/test_trans.json',
    }
    output_path = './data/UCAS_NLP_TC/data_English/data_01_fasttext'
    init_file_path(output_path)

    # Train and dev records go into a single list that is shuffled in place.
    texts = read_file_texts(train_file_config['train'])
    dev_texts = read_file_texts(train_file_config['dev'])
    texts.extend(dev_texts)
    random.shuffle(texts)

    output_file_name = os.path.join(output_path, 'train_data.txt')
    with codecs.open(output_file_name, mode='w', encoding='utf8') as fw:
        for raw_line in tqdm(texts):
            record = json.loads(raw_line)
            label = record['label']
            # Concatenate the 'dst' field of every translation segment.
            news_content = ' '.join(
                segment['dst'] for segment in record['trans_results']
            )
            # Records whose joined text is empty are not written.
            if news_content:
                fw.write('__label__{}\t{}\n'.format(label, news_content))
예제 #4
0
"""提取train中的所有单词"""
import codecs
import sys
from tqdm import tqdm
from collections import OrderedDict

sys.path.insert(0, './')  # 定义搜索路径的优先顺序,序号从0开始,表示最大优先级

import myClue  # noqa

print('myClue module path :{}'.format(myClue.__file__))  # 输出测试模块文件位置
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa

# Script entry point: collects every distinct word of the training file in
# first-seen order and writes the vocabulary one word per line.
if __name__ == "__main__":
    # Alternative input/output pair kept from the original script:
    # train_file = './data/UCAS_NLP_TC/traindata.txt'
    # output_file = './data/UCAS_NLP_TC/train_words.txt'
    train_file = './data/UCAS_NLP_TC/data_11_baidu_basicwords/traindata.txt'
    output_file = './data/UCAS_NLP_TC/train_baidu_basicwords.txt'
    texts = read_file_texts(train_file)

    # Build the vocabulary before opening the output file, so a malformed
    # input line cannot leave a truncated, empty output file behind.
    words = OrderedDict()  # used as an insertion-ordered set
    for text in tqdm(texts):
        # Split only on the first tab: the label precedes it, and any further
        # tab belongs to the content instead of breaking the unpacking.
        _, news_content = text.split('\t', 1)
        for word in news_content.split(' '):
            words[word] = None

    with codecs.open(output_file, mode='w', encoding='utf8') as fw:
        for word in tqdm(words):
            fw.write('{}\n'.format(word))