Example #1
    @classmethod
    def from_file(cls, root, dspider, dcache, debug=False):
        train_database, dev_database = editsql_preprocess.read_db_split(dspider)
        conv = converter.Converter()
        kmaps = evaluation.build_foreign_key_map_from_json(os.path.join(dspider, 'tables.json'))

        splits = {}
        for k in ['train', 'dev']:
            with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
                splits[k] = []
                for ex in json.load(f):
                    splits[k].append(ex)
                    if debug and len(splits[k]) > 100:
                        break
    
        tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

        # decoder vocabulary seeded with special tokens (PAD/EOS/GO/SEP) plus
        # single-token SQL values the decoder commonly emits
        sql_voc = Vocab(['PAD', 'EOS', 'GO', 'SEP', '`', "'", '1', '%', 'yes', '2', '.', '5', 'f', 'm', 'name', 'song', 't', 'l'])

        # make contexts and populate vocab
        for s, data in splits.items():
            proc = []
            for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
                for turn_i, turn in enumerate(ex['interaction']):
                    turn['id'] = '{}/{}:{}'.format(ex['database_id'], i, turn_i)
                    turn['db_id'] = ex['database_id']
                    turn['prev'] = ex['interaction'][turn_i-1] if turn_i > 0 else None
                    new = cls.make_example(turn, tokenizer, sql_voc, kmaps, conv, train=(s == 'train'))
                    if new is not None and (s != 'train' or not new['invalid']):
                        proc.append(new)
            splits[s] = proc
    
        # make candidate list using vocab
        for s, data in splits.items():
            for ex in data:
                ex['cands_query'], ex['cands_value'] = cls.make_cands(ex, sql_voc)
            splits[s] = data
    
        # make pointers for training data
        for ex in splits['train']:
            ex['pointer_query'], ex['pointer_value'] = cls.make_query_pointer(ex['sup_query'], ex['cands_query'], ex['cands_value'], sql_voc)
    
        # look up pretrained word embeddings
        emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
        sql_emb = torch.tensor([emb.emb(w) for w in sql_voc._index2word])
        ext = dict(sql_voc=sql_voc, sql_emb=sql_emb)
        return splits, ext
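
Taken together, this `from_file` builds the text-to-SQL splits: it reads `train.json`/`dev.json` under `root`, links every interaction turn to its database and previous turn, tokenizes with DistilBERT, builds candidate lists and pointers, and returns the processed splits along with the SQL vocabulary and its pretrained embedding matrix. A minimal usage sketch follows; the class name `SQLDataset` and all paths are placeholders, not names from the source.

    # Hypothetical usage sketch: `SQLDataset` stands in for whatever class
    # defines from_file above, and the paths below are placeholders.
    splits, ext = SQLDataset.from_file(
        root='data/conversational',  # directory containing train.json / dev.json
        dspider='data/spider',       # Spider release (provides tables.json)
        dcache='cache/bert',         # tokenizer download cache
        debug=True,                  # cap each split at ~100 examples
    )
    print(len(splits['train']), len(splits['dev']))
    print(ext['sql_emb'].shape)  # one pretrained embedding row per sql_voc entry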
Example #2
    @classmethod
    def from_file(cls, root, dcache, debug=False):
        conv = converter.Converter(os.path.join(root, 'tables.json'))

        splits = {}
        for k in ['train', 'dev']:
            with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
                splits[k] = []
                for ex in json.load(f):
                    ex['query_orig'] = ex['query']
                    splits[k].append(ex)
                    if debug and len(splits[k]) > 100:
                        break
    
        tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

        # utterance vocabulary seeded with padding / end-of-sequence / start tokens
        utt_voc = Vocab(['PAD', 'EOS', 'GO'])

        # make contexts and populate vocab
        for s, data in splits.items():
            proc = []
            for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
                ex['id'] = '{}/{}'.format(ex['db_id'], i)
                new = cls.make_example(ex, tokenizer, utt_voc, conv, train=(s == 'train'))
                if new is not None and (s != 'train' or not new['invalid']):
                    proc.append(new)
            splits[s] = proc
    
        # make candidate list using vocab
        for s, data in splits.items():
            for ex in data:
                ex['cands_question'] = cls.make_cands(ex, utt_voc)
            splits[s] = data
    
        # make pointers for training data
        for ex in splits['train']:
            ex['pointer_question'] = cls.make_question_pointer(ex['sup_question'], ex['cands_question'], utt_voc)
    
        # look up pretrained word embeddings
        emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
        utt_emb = torch.tensor([emb.emb(w) for w in utt_voc._index2word])
        ext = dict(utt_voc=utt_voc, utt_emb=utt_emb)
        return splits, ext
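
All three examples end the same way: they iterate `vocab._index2word` so that row i of the embedding matrix is the pretrained vector for word i. Below is a minimal sketch of the `Vocab` contract these snippets assume; the real class almost certainly has more features (frequency counting, UNK handling), so treat this only as an illustration of the index-alignment invariant.

    # Minimal sketch of the Vocab contract assumed above (an assumption, not
    # the source repo's actual implementation).
    class Vocab:
        def __init__(self, words=()):
            self._index2word = []   # row i of the embedding matrix <-> word i
            self._word2index = {}
            for w in words:
                self.word2index(w, train=True)

        def word2index(self, word, train=False):
            # during training, unseen words are appended to the vocabulary;
            # otherwise an unseen word raises KeyError
            if train and word not in self._word2index:
                self._word2index[word] = len(self._index2word)
                self._index2word.append(word)
            return self._word2index[word]

        def __len__(self):
            return len(self._index2word)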
Example #3
    if not os.path.isdir(dout):
        os.makedirs(dout)

    print('Flattening train')
    train = create_split(train_trees, vocab)
    print('Flattening dev')
    dev = create_split(dev_trees, vocab)

    par = joblib.Parallel(12)  # pool of 12 worker processes for segmentation
    print('Segmenting train')
    train_ba = par(joblib.delayed(segment)(ex, vocab) for ex in tqdm(train))

    # keep only examples that segmented successfully, merging the
    # segmentation output into each example
    train_filtered = []
    for ex, ba in zip(train, train_ba):
        if ba:
            ex.update(ba)
            train_filtered.append(ex)

    print('filtered train from {} to {}'.format(len(train), len(train_filtered)))
    print('vocab size {}'.format(len(vocab)))

    # look up pretrained word embeddings; out-of-vocabulary words fall back to zeros
    emb = embeddings.ConcatEmbedding(
        [embeddings.GloveEmbedding(),
         embeddings.KazumaCharEmbedding()],
        default='zero')
    mat = torch.Tensor([emb.emb(w) for w in vocab._index2word])
    torch.save({'vocab': vocab, 'emb': mat}, os.path.join(dout, 'vocab.pt'))
    torch.save(train_filtered, os.path.join(dout, 'proc_train.pt'))
    torch.save(dev, os.path.join(dout, 'proc_dev.pt'))
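
The three `torch.save` calls write everything a downstream training script needs. A minimal sketch of the matching load step follows, assuming the same output directory; only `torch.load` itself is implied by the source, and the `dout` value is a placeholder.

    import os
    import torch

    dout = 'out'  # placeholder: same output directory used by the script above

    # reload the preprocessed artifacts written by torch.save above
    binary = torch.load(os.path.join(dout, 'vocab.pt'))
    vocab, emb = binary['vocab'], binary['emb']
    train = torch.load(os.path.join(dout, 'proc_train.pt'))
    dev = torch.load(os.path.join(dout, 'proc_dev.pt'))
    assert emb.size(0) == len(vocab)  # one embedding row per vocabulary entry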