def bulid_dataset(args, reader, vocab, debug=False):
    """Build train/dev datasets and dataloaders, caching examples and vocab to disk.

    NOTE(review): 'bulid_dataset' is a typo for 'build_dataset' but the name is
    kept unchanged to preserve the external interface for existing callers.

    Args:
        args: run configuration (input/cache paths, batch size, use_bert,
            use_word2vec, embedding_file, word_emb_size, pin_memory, ...).
        reader: example reader exposing read_examples(path, data_type=...).
        vocab: vocabulary builder; only used when not args.use_bert.
        debug: when True, truncate each split to its first 5000 examples.

    Returns:
        ((train_examples, dev_examples), (train_loader, dev_loader), word_emb)
        where word_emb is None unless word2vec embeddings are enabled.
    """
    word2idx, char2idx, word_emb = None, None, None
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/dev_data.json"

    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"
    word_emb_file = args.cache_data + "/word_emb.pkl"
    char_dictionary = args.cache_data + "/char_dict.pkl"
    word_dictionary = args.cache_data + "/word_dict.pkl"

    if not os.path.exists(train_examples_file):
        # Cold cache: parse the raw JSON into examples and persist everything.
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')

        if not args.use_bert:
            # todo : min_word_count=3 ?
            vocab.build_vocab_only_with_char(train_examples, min_char_count=2,
                                             min_word_count=5)
            if args.use_word2vec and args.embedding_file:
                word_emb = vocab.make_embedding(vocab=vocab.word_vocab,
                                                embedding_file=args.embedding_file,
                                                emb_size=args.word_emb_size)
                save(word_emb_file, word_emb, message="word_emb embedding")
            save(char_dictionary, vocab.char2idx, message="char dictionary")
            # BUG FIX: message previously said "char dictionary" here as well.
            save(word_dictionary, vocab.word2idx, message="word dictionary")
            char2idx = vocab.char2idx
            word2idx = vocab.word2idx
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        # Warm cache: reload examples (and vocab/embeddings when not using BERT).
        if not args.use_bert:
            if args.use_word2vec and args.embedding_file:
                word_emb = load(word_emb_file)
            char2idx = load(char_dictionary)
            word2idx = load(word_dictionary)
            logging.info("total char vocabulary size is {} ".format(len(char2idx)))
            logging.info("total word vocabulary size is {} ".format(len(word2idx)))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)

    logging.info('train examples size is {}'.format(len(train_examples)))
    logging.info('dev examples size is {}'.format(len(dev_examples)))

    if not args.use_bert:
        args.char_vocab_size = len(char2idx)

    convert_examples_features = Feature(args, char2idx=char2idx, word2idx=word2idx)

    train_examples = train_examples[:5000] if debug else train_examples
    dev_examples = dev_examples[:5000] if debug else dev_examples

    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    # NOTE(review): dev loader reuses train_batch_size — confirm this is intended.
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size,
                                                      shuffle=True,
                                                      pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples

    return eval_examples, data_loaders, word_emb
def bulid_dataset(args, spo_config, reader, tokenizer, debug=False):
    """Load (or build and cache) SPO train/dev examples and wrap them in dataloaders.

    Args:
        args: run configuration (input/cache paths, max_len, batch size, pin_memory).
        spo_config: SPO schema configuration forwarded to Feature.
        reader: example reader exposing read_examples(path, data_type=...).
        tokenizer: tokenizer forwarded to Feature and returned unchanged.
        debug: when True, keep only the first 2 examples per split.

    Returns:
        ((train_examples, dev_examples), (train_loader, dev_loader), tokenizer)
    """
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/test2_data.json"
    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"

    if os.path.exists(train_examples_file):
        # Cache hit: reuse previously pickled examples.
        logging.info('loading train cache_data {}'.format(train_examples_file))
        logging.info('loading dev cache_data {}'.format(dev_examples_file))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)
    else:
        # Cache miss: parse raw JSON and persist for the next run.
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")

    logging.info('train examples size is {}'.format(len(train_examples)))
    logging.info('dev examples size is {}'.format(len(dev_examples)))

    convert_examples_features = Feature(max_len=args.max_len,
                                        spo_config=spo_config,
                                        tokenizer=tokenizer)

    if debug:
        train_examples = train_examples[:2]
        dev_examples = dev_examples[:2]

    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size,
                                                      shuffle=True,
                                                      pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    return ((train_examples, dev_examples),
            (train_data_loader, dev_data_loader),
            tokenizer)
from data.xpath_file import JianDanXpath
from data.url_file import JianDanUrl
from lxml import etree
import html as hhh
from utils import file_util
import re

# NOTE(review): RequestFactory is not imported in this chunk — presumably
# brought into scope elsewhere in the file; verify before relying on it.
req = RequestFactory()

# Fetch the index page and collect (title, href) pairs.
html = req.getRequest(url=JianDanUrl.index_url)
index_data = req.find(html, JianDanXpath.index)

title_list = []
for i in index_data:
    temp = {
        'title': req.find(i, JianDanXpath.index_title)[0],
        'href': req.find(i, JianDanXpath.index_title_link)[0],
    }
    title_list.append(temp)
# IDIOM: str(x) instead of x.__str__()
print(str(title_list))

# PERF: compile the invariant pattern once, outside the per-article loop
# (original recompiled it on every iteration).
rept = re.compile(r'<p>.*</p>')

for i in title_list:
    # Download each article and serialize its content node back to HTML text.
    temp_html = req.getRequest(url=i['href'])
    i['content'] = hhh.unescape(
        str(etree.tostring(req.find(temp_html, JianDanXpath.content)[0],
                           encoding='utf-8'),
            encoding='utf-8'))
    # Keep only <p>...</p> spans. IDIOM/PERF: ''.join replaces the original
    # quadratic `s = s + temp[j]` loop; match set and order are unchanged.
    i['result'] = ''.join(rept.findall(i['content']))

for i in title_list:
    # Raw-string path is preserved byte-for-byte (r'JianDan\\' = two backslashes).
    file_util.save(r'JianDan\\' + i['title'] + '.html', i['result'])