if __name__ == '__main__':
    # Map each supported --dataset flag value to its constructor: fixes the
    # duplicated `ds = None` assignment and replaces the long if/elif chain
    # with a table so adding a dataset is a one-line change.
    DATASETS = {
        'STS': STS,
        'STSLarge': STSLarge,
        'PPDB': PPDB,
        'Quora': Quora,
        'Sick': Sick,
        'SemEval': SemEval,
        'StackExchange': StackExchange,
    }
    if FLAGS.dataset not in DATASETS:
        raise NotImplementedError('Dataset {} has not been '
                                  'implemented yet'.format(FLAGS.dataset))
    # Prints the same 'Using the <name> dataset' message as the old branches.
    print('Using the {} dataset'.format(FLAGS.dataset))
    ds = DATASETS[FLAGS.dataset]()
    if FLAGS.mode == 'train':
        train(ds, ds.metadata_path, ds.w2v)
The idea of this simple code is to load and dump text to our trained models. """ import collections import datasets from datasets import Quora from datasets import seq2id from datasets import merge_sentences import tflearn from models import AttentionBlstmQuora quora = Quora() Batch = collections.namedtuple('Batch', ['s1', 's2', 'sim']) def get_sents_encoded(sentence_1, sentence_2, dt=quora): data = [ datasets.tokenize(sentence_1, lang='en'), datasets.tokenize(sentence_2, lang='en') ] vocab_is = dt.w2i lst_sent_ids = seq2id(data, vocab_is, seq_begin=False, seq_end=False) s1_ids = lst_sent_ids[0] s2_ids = lst_sent_ids[1] return s1_ids, s2_ids
import random

import numpy as np

engine = get_engine()

# Seed every RNG source used below so runs are reproducible.
seed = 4269666
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Constant definition
device = torch.device("cuda:2")

# NOTE: the problem comes from the count vectorizer, which drops some words.
print("Load Dataset")
dataset = Quora.torch_dataset()
dataclasses = Quora.dataclasses()
dataclasses = {item._id: item for item in dataclasses}


def embedding_collate_decorator(collate_fn):
    """Wrap *collate_fn*; the wrapper currently forwards its
    ``(x, y, id_, qrels, seq_lens)`` result unchanged (extension point)."""
    def wrapper(batch):
        features, labels, ids, qrels, lengths = collate_fn(batch)
        return features, labels, ids, qrels, lengths
    return wrapper


collate_fn = embedding_collate_decorator(sequence_collate_fn)

# 70% train / 15% validation split sizes (remainder presumably test).
train_len, val_len = int(0.7 * len(dataset)), int(0.15 * len(dataset))
import sys
import os
from os import path

# Make ../src (relative to this script's real location) importable.
libpath = path.normpath(
    path.join(path.dirname(path.realpath(__file__)), os.pardir, "src"))
sys.path.append(libpath)

import pickle as pkl
import torch
import data
from datasets import Quora, Robust2004

# Temporarily expose `data` under the name "dataset" while loading —
# presumably the stored objects reference that module name; verify against
# the pickled payloads.
sys.modules["dataset"] = data
quora_dc = Quora.dataclasses()
quora_torch = Quora.torch_dataset()
rb_dc = Robust2004.dataclasses()
rb_torch = Robust2004.torch_dataset()
del sys.modules["dataset"]

# Re-serialize the dataclass collections with pickle ...
for obj, target in ((quora_dc, Quora.dataclasses_path),
                    (rb_dc, Robust2004.dataclasses_path)):
    with open(target, "wb") as f:
        pkl.dump(obj, f)

# ... and the torch datasets with torch.save.
for torch_ds, target in ((quora_torch, Quora.torch_path),
                         (rb_torch, Robust2004.torch_path)):
    torch.save(torch_ds, target)