"""训练数据与测试数据随机排序,去掉停用词 """ import codecs import os import sys from tqdm import tqdm sys.path.insert(0, './') # 定义搜索路径的优先顺序,序号从0开始,表示最大优先级 import myClue # noqa print('myClue module path :{}'.format(myClue.__file__)) # 输出测试模块文件位置 from myClue.core import logger # noqa from myClue.tools.file import read_file_texts # noqa from myClue.tools.file import init_file_path # noqa stopwords = set(read_file_texts('./data/stopwords/stopwords_mix.txt')) logger.info('stopwords len:{}, example:{}'.format(len(stopwords), list(stopwords)[:20])) def news_content_process(news_content): """"数据转换处理""" words = news_content.split(' ') words = [word for word in words if word not in stopwords] return ' '.join(words) if __name__ == "__main__": train_file_config = { 'train': './data/UCAS_NLP_TC/data_01_shuffle/traindata.txt', 'dev': './data/UCAS_NLP_TC/data_01_shuffle/devdata.txt',
"""Extract named-entity words from Baidu-CWS json records.

NOTE(review): this chunk starts mid-file — the stdlib imports below are
reconstructed from the names the visible code uses; confirm against the
original script header.
"""
import codecs
import json
import os
import sys

from tqdm import tqdm

sys.path.insert(0, './')  # prepend so the local myClue package wins the import search
import myClue  # noqa
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa


if __name__ == "__main__":
    # One json-lines input per split; each record carries a label and CWS items.
    train_file_config = {
        'train': './data/UCAS_NLP_TC/data_baidu_cws/train_cws.json',
        'dev': './data/UCAS_NLP_TC/data_baidu_cws/dev_cws.json',
        'test': './data/UCAS_NLP_TC/data_baidu_cws/test_cws.json',
    }
    output_path = './data/UCAS_NLP_TC/data_11_baidu_nerwords'
    init_file_path(output_path)
    for file_label, file_name in train_file_config.items():
        logger.info('开始处理:{}'.format(file_label))
        texts = read_file_texts(file_name)
        output_file_name = os.path.join(output_path, '{}data.txt'.format(file_label))
        with codecs.open(output_file_name, mode='w', encoding='utf8') as fw:
            for text in tqdm(texts):
                row_data = json.loads(text)
                label = row_data['label']
                cws_items = row_data['cws_items']
                words = list()          # NE tokens kept for this record
                item_filter = set()     # de-duplicates NE items within one record
                original_words = list() # full basic-word segmentation, kept in order
                for cws_item in cws_items:
                    item_text = cws_item['item']
                    original_words.extend(cws_item['basic_words'])
                    # Keep only items tagged as named entities (org/person/location/etc.).
                    if cws_item['ne'] in {'ORG', 'PER', 'LOC', 'nr', 'ns', 'nt', 'nw', 'nz'}:
                        if item_text in item_filter:
                            continue
                        # NOTE(review): the source chunk is truncated here — the rest
                        # of the NE handling and the write-out of `words` /
                        # `original_words` is missing from this view and has NOT
                        # been reconstructed.
"""Build an English fastText training file from Baidu-translated records.

NOTE(review): this chunk starts mid-file — the stdlib imports and the
sys.path tweak below are reconstructed from the names the visible code
uses; confirm against the original script header.
"""
import codecs
import json
import os
import random
import sys

from tqdm import tqdm

sys.path.insert(0, './')  # prepend so the local myClue package wins the import search
import myClue  # noqa
print('myClue module path :{}'.format(myClue.__file__))  # confirm which module file was loaded
from myClue.core import logger  # noqa
from myClue.tools.file import read_file_texts  # noqa
from myClue.tools.file import init_file_path  # noqa


if __name__ == "__main__":
    train_file_config = {
        'train': './data/UCAS_NLP_TC/data_baidu_trans/train_trans.json',
        'dev': './data/UCAS_NLP_TC/data_baidu_trans/dev_trans.json',
        # 'test' is declared for symmetry with sibling scripts but unused below.
        'test': './data/UCAS_NLP_TC/data_baidu_trans/test_trans.json',
    }
    output_path = './data/UCAS_NLP_TC/data_English/data_01_fasttext'
    init_file_path(output_path)
    # train + dev are merged and shuffled into one fastText training file.
    texts = read_file_texts(train_file_config['train'])
    texts.extend(read_file_texts(train_file_config['dev']))
    random.shuffle(texts)
    output_file_name = os.path.join(output_path, 'train_data.txt')
    with codecs.open(output_file_name, mode='w', encoding='utf8') as fw:
        for text in tqdm(texts):
            row_data = json.loads(text)
            label = row_data['label']
            trans_results = row_data['trans_results']
            # Concatenate the per-sentence translations ('dst') into one document.
            eng_texts = list()
            for trans_result in trans_results:
                eng_texts.append(trans_result['dst'])
            news_content = ' '.join(eng_texts)
            if len(news_content) == 0:
                # Skip records whose translation came back empty.
                continue
            fw.write('__label__{}\t{}\n'.format(label, news_content))
"""提取train中的所有单词""" import codecs import sys from tqdm import tqdm from collections import OrderedDict sys.path.insert(0, './') # 定义搜索路径的优先顺序,序号从0开始,表示最大优先级 import myClue # noqa print('myClue module path :{}'.format(myClue.__file__)) # 输出测试模块文件位置 from myClue.core import logger # noqa from myClue.tools.file import read_file_texts # noqa from myClue.tools.file import init_file_path # noqa if __name__ == "__main__": # train_file = './data/UCAS_NLP_TC/traindata.txt' # output_file = './data/UCAS_NLP_TC/train_words.txt' train_file = './data/UCAS_NLP_TC/data_11_baidu_basicwords/traindata.txt' output_file = './data/UCAS_NLP_TC/train_baidu_basicwords.txt' texts = read_file_texts(train_file) with codecs.open(output_file, mode='w', encoding='utf8') as fw: words = OrderedDict() for text in tqdm(texts): label, news_content = text.split('\t') row_words = news_content.split(' ') for word in row_words: words[word] = None for word in tqdm(words): fw.write('{}\n'.format(word))