Example #1
def load_data(dataset, n_train, n_val):
    print("Loading embeddings... This can take a while the first time.")
    return encode_sentiment_data(
        dataset,
        embeddings.GloveEmbedding("wikipedia_gigaword",
                                  d_emb=EMBEDDING_SIZE,
                                  show_progress=True),
        n_train,
        n_val,
    )
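
A possible call site for this helper, mirroring the dataset and split sizes used in Example #6 below; the Hugging Face load_dataset import and the concrete sizes are illustrative assumptions, not part of the original snippet.

# Illustrative usage only; the dataset and sizes mirror Example #6 and are assumptions here.
from datasets import load_dataset  # Hugging Face datasets, assumed available

(X_train, y_train), (X_val, y_val) = load_data(
    load_dataset("glue", "sst2"),  # SST-2 sentiment dataset
    n_train=450,
    n_val=100,
)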
Example #2
    def __init__(self, args):
        super().__init__()
        self.glove = E.GloveEmbedding('wikipedia_gigaword', 300, default='zero')
        ### Start of your code
        self.linear1 = nn.Linear(300, 100)
        self.tanh = nn.Tanh()
        self.linear2 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()
        ### End of your code

        # do not touch this line below
        self.optim = torch.optim.Adam(self.parameters(), args.learning_rate)
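
For context, a forward pass that fits the layers defined above could look like the sketch below; the method name, the whitespace tokenization, and the mean pooling of GloVe vectors are assumptions rather than part of the original assignment code.

    # Hypothetical forward pass (not from the original file): look up a 300-d
    # GloVe vector per token with the 'zero' default, mean-pool over tokens,
    # then apply the Linear -> Tanh -> Linear -> Sigmoid head defined above.
    def forward(self, sentences):
        pooled = []
        for sent in sentences:
            vecs = [self.glove.emb(w) for w in sent.lower().split()]  # naive whitespace tokenization
            pooled.append(torch.tensor(vecs, dtype=torch.float).mean(dim=0))
        x = torch.stack(pooled)                  # (batch, 300)
        h = self.tanh(self.linear1(x))           # (batch, 100)
        return self.sigmoid(self.linear2(h))     # (batch, 1) probability of positive sentiment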
Example #3
    def __init__(self, args):
        super().__init__()
        self.glove = E.GloveEmbedding('wikipedia_gigaword',
                                      300,
                                      default='zero')
        self.lossfunction = torch.nn.BCELoss()
        ### Start of your code
        self.fullyConnectedOne = torch.nn.Sequential(torch.nn.Linear(300, 100),
                                                     torch.nn.Tanh())

        self.outputLayer = torch.nn.Sequential(torch.nn.Linear(100, 1),
                                               torch.nn.Sigmoid())

        # do not touch this line below
        self.optim = torch.optim.Adam(self.parameters(), args.learning_rate)
Example #4
    @classmethod
    def from_file(cls, root, dspider, dcache, debug=False):
        train_database, dev_database = editsql_preprocess.read_db_split(dspider)
        conv = converter.Converter()
        kmaps = evaluation.build_foreign_key_map_from_json(os.path.join(dspider, 'tables.json'))

        splits = {}
        for k in ['train', 'dev']:
            with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
                splits[k] = []
                for ex in json.load(f):
                    splits[k].append(ex)
                    if debug and len(splits[k]) > 100:
                        break
    
        tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

        sql_voc = Vocab(['PAD', 'EOS', 'GO', 'SEP', '`', "'", '1', '%', 'yes', '2', '.', '5', 'f', 'm', 'name', 'song', 't', 'l'])

        # make contexts and populate vocab
        for s, data in splits.items():
            proc = []
            for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
                for turn_i, turn in enumerate(ex['interaction']):
                    turn['id'] = '{}/{}:{}'.format(ex['database_id'], i, turn_i)
                    turn['db_id'] = ex['database_id']
                    turn['prev'] = ex['interaction'][turn_i-1] if turn_i > 0 else None
                    new = cls.make_example(turn, tokenizer, sql_voc, kmaps, conv, train=s=='train')
                    if new is not None and (s != 'train' or not new['invalid']):
                        proc.append(new)
            splits[s] = proc
    
        # make candidate list using vocab
        for s, data in splits.items():
            for ex in data:
                ex['cands_query'], ex['cands_value'] = cls.make_cands(ex, sql_voc)
            splits[s] = data
    
        # make pointers for training data
        for ex in splits['train']:
            ex['pointer_query'], ex['pointer_value'] = cls.make_query_pointer(ex['sup_query'], ex['cands_query'], ex['cands_value'], sql_voc)
    
        # look up pretrained word embeddings
        emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
        sql_emb = torch.tensor([emb.emb(w) for w in sql_voc._index2word])
        ext = dict(sql_voc=sql_voc, sql_emb=sql_emb)
        return splits, ext
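
The sql_emb tensor returned in ext is a pretrained-embedding matrix whose rows align with sql_voc; a common (assumed, not shown here) way to consume it downstream is to initialize a torch embedding layer from it. SQLDataset and the path arguments in the sketch are placeholder names.

# Hypothetical downstream use (assumption, not part of this example);
# SQLDataset, root, dspider and dcache are placeholder names.
import torch.nn as nn

splits, ext = SQLDataset.from_file(root, dspider, dcache, debug=False)
sql_embedding = nn.Embedding.from_pretrained(ext['sql_emb'], freeze=False)  # rows align with sql_voc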
Example #5
    @classmethod
    def from_file(cls, root, dcache, debug=False):
        conv = converter.Converter(os.path.join(root, 'tables.json'))

        splits = {}
        for k in ['train', 'dev']:
            with open(os.path.join(root, '{}.json'.format(k)), 'rb') as f:
                splits[k] = []
                for ex in json.load(f):
                    ex['query_orig'] = ex['query']
                    splits[k].append(ex)
                    if debug and len(splits[k]) > 100:
                        break
    
        tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, cache_dir=dcache)

        utt_voc = Vocab(['PAD', 'EOS', 'GO'])

        # make contexts and populate vocab
        for s, data in splits.items():
            proc = []
            for i, ex in enumerate(tqdm.tqdm(data, desc='preprocess {}'.format(s))):
                ex['id'] = '{}/{}'.format(ex['db_id'], i)
                new = cls.make_example(ex, tokenizer, utt_voc, conv, train=s=='train')
                if new is not None and (s != 'train' or not new['invalid']):
                    proc.append(new)
            splits[s] = proc
    
        # make candidate list using vocab
        for s, data in splits.items():
            for ex in data:
                ex['cands_question'] = cls.make_cands(ex, utt_voc)
            splits[s] = data
    
        # make pointers for training data
        for ex in splits['train']:
            ex['pointer_question'] = cls.make_question_pointer(ex['sup_question'], ex['cands_question'], utt_voc)
    
        # look up pretrained word embeddings
        emb = E.ConcatEmbedding([E.GloveEmbedding(), E.KazumaCharEmbedding()], default='zero')
        utt_emb = torch.tensor([emb.emb(w) for w in utt_voc._index2word])
        ext = dict(utt_voc=utt_voc, utt_emb=utt_emb)
        return splits, ext
Example #6
    )
    print(f"missing pre-trained embedding for {len(unks)} unknown words")

    return (X_train, y_train), (X_val, y_val)


if __name__ == "__main__":
    train_size = 450
    validation_size = 100
    learning_rate = 0.01
    max_epochs = 250

    (X_train, y_train), (X_val, y_val) = encode_sentiment_data(
        load_dataset("glue", "sst2"),
        embeddings.GloveEmbedding("wikipedia_gigaword",
                                  d_emb=50,
                                  show_progress=True),
        train_size,
        validation_size,
    )
    model_trainer = SentenceSentimentTrain(
        CNNSentimentKim(feature_map_size=100,
                        filter_sizes=[3, 4, 5],
                        dropout=0.25))
    model_trainer.train(
        (X_train, y_train),
        learning_rate,
        max_epochs=max_epochs,
        data_val=(X_val, y_val),
    )
Example #7
#!/usr/bin/env python
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import random
import time
from nltk.corpus import brown
import tqdm
import embeddings as E


if __name__ == '__main__':
    random.seed(0)
    n_samples = 10000
    k1 = E.KazumaCharEmbedding(check_same_thread=True)
    k2 = E.KazumaCharEmbedding(check_same_thread=False)

    g1 = E.GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, check_same_thread=True)
    g2 = E.GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, check_same_thread=False)

    for w in ['canada', 'vancouver', 'toronto']:
        assert(k1.emb(w) == k2.emb(w))
        assert(g1.emb(w) == g2.emb(w))
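
As an optional follow-up (not in the original script), the vectors returned by emb() are plain Python lists of floats, so they can be compared directly, for example with a hand-rolled cosine similarity:

# Optional follow-up (not in the original script): compare two of the queried
# vectors with a simple cosine similarity over the returned float lists.
import math

def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

print(cosine(g1.emb('vancouver'), g1.emb('toronto')))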

        
Example #8
    if not os.path.isdir(dout):
        os.makedirs(dout)

    print('Flattening train')
    train = create_split(train_trees, vocab)
    print('Flattening dev')
    dev = create_split(dev_trees, vocab)

    par = joblib.Parallel(12)
    print('Segmenting train')
    train_ba = par(joblib.delayed(segment)(ex, vocab) for ex in tqdm(train))

    train_filtered = []
    for ex, ba in zip(train, train_ba):
        if ba:
            ex.update(ba)
            train_filtered.append(ex)

    print('filtered train from {} to {}'.format(len(train),
                                                len(train_filtered)))
    print('vocab size {}'.format(len(vocab)))

    emb = embeddings.ConcatEmbedding(
        [embeddings.GloveEmbedding(),
         embeddings.KazumaCharEmbedding()],
        default='zero')
    mat = torch.Tensor([emb.emb(w) for w in vocab._index2word])
    torch.save({'vocab': vocab, 'emb': mat}, dout + '/vocab.pt')
    torch.save(train_filtered, dout + '/proc_train.pt')
    torch.save(dev, dout + '/proc_dev.pt')
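
A hedged sketch of how the files written above might be read back later; the embedding-layer initialization is an assumption about the downstream model and is not part of this example.

# Hypothetical loading code (assumption, not part of this example): restore the
# saved vocab and pretrained matrix, then wrap the matrix in an embedding layer.
import torch
import torch.nn as nn

saved = torch.load(dout + '/vocab.pt')
vocab, emb_matrix = saved['vocab'], saved['emb']
embedding_layer = nn.Embedding.from_pretrained(emb_matrix, freeze=True)
train_examples = torch.load(dout + '/proc_train.pt')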
Example #9
        train_trees = json.load(f)
    with open('sharc/trees_dev.json') as f:
        dev_trees = json.load(f)
    dout = 'sharc/editor_disjoint'
    if not os.path.isdir(dout):
        os.makedirs(dout)

    print('Flattening train')
    train = create_split(train_trees, vocab)
    print('Flattening dev')
    dev = create_split(dev_trees, vocab)

    par = joblib.Parallel(12)
    print('Segmenting train')
    train_ba = par(joblib.delayed(segment)(ex, vocab) for ex in tqdm(train))

    train_filtered = []
    for ex, ba in zip(train, train_ba):
        if ba:
            ex.update(ba)
            train_filtered.append(ex)

    print('filtered train from {} to {}'.format(len(train), len(train_filtered)))
    print('vocab size {}'.format(len(vocab)))

    emb = embeddings.ConcatEmbedding([embeddings.GloveEmbedding(), embeddings.KazumaCharEmbedding()], default='zero')
    mat = torch.Tensor([emb.emb(w) for w in vocab._index2word])
    torch.save({'vocab': vocab, 'emb': mat}, dout + '/vocab.pt')
    torch.save(train_filtered, dout + '/proc_train.pt')
    torch.save(dev, dout + '/proc_dev.pt')
Example #10
#!/usr/bin/env python
import random
import time
from nltk.corpus import brown
import tqdm
import embeddings as E

if __name__ == '__main__':
    random.seed(0)
    n_samples = 10000
    emb = E.GloveEmbedding()
    times = []
    vocab = list(brown.words())
    samples = [random.choice(vocab) for i in range(n_samples)]

    for w in tqdm.tqdm(samples):
        start = time.time()
        emb.emb(w)
        end = time.time()
        times.append(end - start)
    print(sum(times) / len(times))
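
A small optional extension (not in the original script): reporting the spread of per-lookup times alongside the mean makes it easier to spot occasional slow outliers.

# Optional extension (not in the original script): report the spread of
# per-lookup times, not just the mean.
print('min {:.6f}s, mean {:.6f}s, max {:.6f}s'.format(
    min(times), sum(times) / len(times), max(times)))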