Example #1
import logging
import os

# save/load helpers and the Feature class are project utilities assumed to be in scope.
def bulid_dataset(args, reader, vocab, debug=False):
    word2idx, char2idx, word_emb = None, None, None
    train_src = args.input + "/train_data.json"
    dev_src = args.input + "/dev_data.json"

    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"

    word_emb_file = args.cache_data + "/word_emb.pkl"
    char_dictionary = args.cache_data + "/char_dict.pkl"
    word_dictionary = args.cache_data + "/word_dict.pkl"

    if not os.path.exists(train_examples_file):

        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')

        if not args.use_bert:
            # todo : min_word_count=3 ?
            vocab.build_vocab_only_with_char(train_examples, min_char_count=2, min_word_count=5)
            if args.use_word2vec and args.embedding_file:
                word_emb = vocab.make_embedding(vocab=vocab.word_vocab,
                                                embedding_file=args.embedding_file,
                                                emb_size=args.word_emb_size)
                save(word_emb_file, word_emb, message="word_emb embedding")
            save(char_dictionary, vocab.char2idx, message="char dictionary")
            save(word_dictionary, vocab.word2idx, message="word dictionary")
            char2idx = vocab.char2idx
            word2idx = vocab.word2idx
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        if not args.use_bert:
            if args.use_word2vec and args.embedding_file:
                word_emb = load(word_emb_file)
            char2idx = load(char_dictionary)
            word2idx = load(word_dictionary)
            logging.info("total char vocabulary size is {} ".format(len(char2idx)))
            logging.info("total word vocabulary size is {} ".format(len(word2idx)))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)

        logging.info('train examples size is {}'.format(len(train_examples)))
        logging.info('dev examples size is {}'.format(len(dev_examples)))

    if not args.use_bert:
        args.char_vocab_size = len(char2idx)
    convert_examples_features = Feature(args, char2idx=char2idx, word2idx=word2idx)

    train_examples = train_examples[:5000] if debug else train_examples
    dev_examples = dev_examples[:5000] if debug else dev_examples

    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size, shuffle=True, pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples

    return eval_examples, data_loaders, word_emb
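
A minimal call sketch for the function above, assuming an argparse-style `args` object; the attribute values and the `reader`/`vocab` objects are placeholders, not the project's actual configuration:

from argparse import Namespace

# Hypothetical configuration mirroring the attributes bulid_dataset reads.
args = Namespace(
    input="data",                              # holds train_data.json / dev_data.json
    cache_data="cache",                        # where the *.pkl caches live
    use_bert=False,
    use_word2vec=True,
    embedding_file="embeddings/word2vec.txt",  # assumed path
    word_emb_size=300,
    train_batch_size=32,
    pin_memory=True,
)

# `reader` and `vocab` are the project's reader/vocabulary objects (not shown here).
eval_examples, data_loaders, word_emb = bulid_dataset(args, reader, vocab, debug=True)
train_loader, dev_loader = data_loaders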
Example #2
import logging
import os

# save/load helpers and the Feature class are project utilities assumed to be in scope.
def bulid_dataset(args, spo_config, reader, tokenizer, debug=False):
    train_src = args.input + "/train_data.json"
    # Note: this variant reads its "dev" split from test2_data.json.
    dev_src = args.input + "/test2_data.json"

    train_examples_file = args.cache_data + "/train-examples.pkl"
    dev_examples_file = args.cache_data + "/dev-examples.pkl"

    if not os.path.exists(train_examples_file):
        train_examples = reader.read_examples(train_src, data_type='train')
        dev_examples = reader.read_examples(dev_src, data_type='dev')
        save(train_examples_file, train_examples, message="train examples")
        save(dev_examples_file, dev_examples, message="dev examples")
    else:
        logging.info('loading train cache_data {}'.format(train_examples_file))
        logging.info('loading dev cache_data {}'.format(dev_examples_file))
        train_examples, dev_examples = load(train_examples_file), load(dev_examples_file)

        logging.info('train examples size is {}'.format(len(train_examples)))
        logging.info('dev examples size is {}'.format(len(dev_examples)))

    convert_examples_features = Feature(max_len=args.max_len, spo_config=spo_config, tokenizer=tokenizer)

    train_examples = train_examples[:2] if debug else train_examples
    dev_examples = dev_examples[:2] if debug else dev_examples

    train_data_set = convert_examples_features(train_examples, data_type='train')
    dev_data_set = convert_examples_features(dev_examples, data_type='dev')
    train_data_loader = train_data_set.get_dataloader(args.train_batch_size, shuffle=True, pin_memory=args.pin_memory)
    dev_data_loader = dev_data_set.get_dataloader(args.train_batch_size)

    data_loaders = train_data_loader, dev_data_loader
    eval_examples = train_examples, dev_examples

    return eval_examples, data_loaders, tokenizer
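
The same call pattern for this tokenizer-based variant; `transformers.BertTokenizer` is my assumption for the tokenizer argument, and `spo_config`/`reader` again come from the project:

from argparse import Namespace
from transformers import BertTokenizer  # an assumption; any tokenizer with the expected interface works

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
args = Namespace(input="data", cache_data="cache", max_len=128,
                 train_batch_size=16, pin_memory=True)

# `spo_config` and `reader` are project objects (not shown here).
eval_examples, data_loaders, tokenizer = bulid_dataset(args, spo_config, reader, tokenizer)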
Example #3
import os
import re

import html as html_lib  # aliased: the local variable `html` below would shadow the module
from lxml import etree

from data.xpath_file import JianDanXpath
from data.url_file import JianDanUrl
from utils import file_util

# RequestFactory is defined elsewhere in the project (not shown in this snippet).
req = RequestFactory()
html = req.getRequest(url=JianDanUrl.index_url)
index_data = req.find(html, JianDanXpath.index)

# Collect the title and link of every entry on the index page.
title_list = []
for i in index_data:
    temp = {}
    temp['title'] = req.find(i, JianDanXpath.index_title)[0]
    temp['href'] = req.find(i, JianDanXpath.index_title_link)[0]
    title_list.append(temp)
print(title_list)

# Fetch each article and keep only its <p>...</p> paragraphs.
for i in title_list:
    temp_html = req.getRequest(url=i['href'])
    content_node = req.find(temp_html, JianDanXpath.content)[0]
    i['content'] = html_lib.unescape(
        str(etree.tostring(content_node, encoding='utf-8'), encoding='utf-8'))
    # Non-greedy match so each paragraph is captured separately
    # (the original greedy '<p>.*</p>' swallowed everything between the
    # first <p> and the last </p> on a line).
    i['result'] = ''.join(re.findall(r'<p>.*?</p>', i['content']))

# Save each article; the JianDan directory is assumed to exist.
# (The original path r'JianDan\\' produced two literal backslashes.)
for i in title_list:
    file_util.save(os.path.join('JianDan', i['title'] + '.html'), i['result'])
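
A self-contained check of the paragraph-extraction step used above (the sample HTML is made up); the non-greedy pattern keeps each <p>...</p> block separate instead of swallowing everything between the first <p> and the last </p>:

import re

sample = "<p>first</p> junk <p>second</p>"
print(re.findall(r'<p>.*</p>', sample))   # greedy: ['<p>first</p> junk <p>second</p>']
print(re.findall(r'<p>.*?</p>', sample))  # non-greedy: ['<p>first</p>', '<p>second</p>']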