Example #1
    def __init__(self,
                 max_sent_length,
                 max_sent_src,
                 max_sent_trg,
                 data_folder,
                 model_folder,
                 pretrain_path,
                 prefix,
                 source_file,
                 target_file,
                 use_gensen_w2i,
                 device_ids=[0],
                 data_parallelize=False,
                 test=False):
        """
        :param max_sent_length: max words in sentence
               gensen_h --> batch size x max_len x rep_size
        :param max_sent_src: number of sentences in source doc
        :param max_sent_trg: number of sentences in target doc
        :param data_folder: data location
        :param model_folder: location of pretrained gensen
        :param pretrain_path: location of pretrained embeddings (e.g. Glove)
        :param prefix: filename prefix for the type of gensen ["nli_large"+"bothskip"+"arxiv"]
        :param source_file: name of source file in data_folder
        :param target_file: name of target file in data_folder
        :param use_gensen_w2i: use the word to ids for pretrained gensen
        :param device_ids: used when data_parallelize = True, specify devices to use
        :param data_parallelize: if True, run the gensen encoder in parallel across device_ids
        :param test: if True, load only a small subset of documents for testing
        """
        self.max_len = max_sent_length  # max words
        self.max_sent_src = max_sent_src  # max sentences src
        self.max_sent_trg = max_sent_trg  # max sentences trg
        self.data_folder = data_folder
        self.source_file = source_file
        self.target_file = target_file
        self.src_data = []
        self.atrg_data = []
        self.data_parallelize = data_parallelize
        self.device_ids = device_ids
        self.test = test

        logging.debug(""" max_len: {}, max_sent_src: {}, max_sent_trg: {},
                data_folder: {}, source_file: {}, target_file: {}
            """.format(self.max_len, self.max_sent_src, self.max_sent_trg,
                       self.data_folder, self.source_file, self.target_file))
        self.gensen = GenSenSingle(model_folder=model_folder,
                                   filename_prefix=prefix,
                                   pretrained_emb=pretrain_path,
                                   cuda=True,
                                   max_sentence_length=max_sent_length,
                                   data_parallelize=data_parallelize,
                                   device_ids=device_ids[::-1])
        self.sen_rep_dim = self.gensen.sen_rep_dim
        self.vocab_size = self.gensen.vocab_size
        self.emb_dim = self.gensen.embedding_dim
        self.vocab_expansion(use_gensen_w2i)
Example #2
    def __init__(self):
        """Initializes object
        """
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'embedding', 'glove.840B.300d.h5')
        )

        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)

        self.__mutex = Lock()
Example #3
def get_gensen_synset_definitions(entity_file, vocab_file, gensen_file):
    # NUM_EMBEDDINGS is assumed to be a module-level constant
    # (at least len(vocab_list) + 1, since slot 0 stays empty).
    import json

    import h5py
    import numpy as np
    from gensen import GenSen, GenSenSingle

    gensen_1 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5')
    gensen_1.eval()

    definitions = {}
    with open(entity_file, 'r') as fin:
        for line in fin:
            node = json.loads(line)
            if node['type'] == 'synset':
                definitions[node['id']] = node['definition']

    with open(vocab_file, 'r') as fin:
        vocab_list = fin.read().strip().split('\n')

    # get the descriptions; entity k fills slot k + 1, slot 0 stays empty
    sentences = [''] * NUM_EMBEDDINGS
    for k, entity in enumerate(vocab_list):
        definition = definitions.get(entity)
        if definition is None:
            assert entity in ('@@UNKNOWN@@', '@@MASK@@', '@@NULL@@')
        else:
            sentences[k + 1] = definition

    embeddings = np.zeros((NUM_EMBEDDINGS, 2048), dtype=np.float32)
    for k in range(0, NUM_EMBEDDINGS, 32):
        sents = sentences[k:(k + 32)]
        reps_h, reps_h_t = gensen_1.get_representation(sents,
                                                       pool='last',
                                                       return_numpy=True,
                                                       tokenize=True)
        embeddings[k:(k + 32), :] = reps_h_t
        print(k)

    with h5py.File(gensen_file, 'w') as fout:
        fout.create_dataset('gensen', data=embeddings)
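A hypothetical invocation of the function above (all file names are illustrative):

get_gensen_synset_definitions(entity_file='wordnet_entities.jsonl',
                              vocab_file='vocab.txt',
                              gensen_file='gensen_definitions.h5')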
Example #4
    def __init__(self, train_loader, validation_loader, test_loader, device):
        self.train_loader = train_loader
        self.validation_loader = validation_loader
        self.test_loader = test_loader

        self.clip = 5

        self.latent_size = 2048
        self.decoder_hidden_size = 2048

        self.decoder_layers = 2

        self.noise = Normal(torch.tensor([0.0], requires_grad=False),
                            torch.tensor([0.12], requires_grad=False))

        self.encoder = GenSenSingle(
            model_folder='./data/models',
            filename_prefix='nli_large_bothskip',
            pretrained_emb='./data/embedding/glove.840B.300d.h5',
            cuda=torch.cuda.is_available())
        vocab_size = len(self.encoder.encoder.src_embedding.weight)
        self.encoder.encoder.to(device)
        self.decoder = Decoder(self.decoder_hidden_size,
                               self.latent_size,
                               vocab_size,
                               self.decoder_layers,
                               device=device,
                               clip=5).to(device)

        # zero out <pad> so padding tokens contribute nothing to the loss
        weight_mask = torch.ones(vocab_size).to(device)
        weight_mask[self.encoder.word2id['<pad>']] = 0
        self.criterion = nn.CrossEntropyLoss(weight=weight_mask).to(device)
        self.bce = nn.BCELoss()
        self.device = device

        self.embedding_norms = torch.norm(
            self.encoder.encoder.src_embedding.weight, 1)

        print(self.decoder)
Example #5
	# W2V_PATH = '/Users/karanjani/Desktop/InferSent-master/dataset/fastText/crawl-300d-2M.vec' #ENTER PATH TO FASTTEXT
	# infersent.set_w2v_path(W2V_PATH)

	# infersent.build_vocab(cleanedStrings, tokenize=True)
	# embeddings = infersent.encode(cleanedStrings, tokenize=True)

	# fbvecFrame = pd.DataFrame(list(embeddings)) #converting Facebook embeddings tuple to dataframe
	# FBcols = ["FB%d" % d for d in range(4096)] #creating list of column names for Facebook vectors
	# fbvecFrame.columns = FBcols #reset column names to be FB1, FB2 ... FB4096 
	# fullFrame = pd.concat([df, fbvecFrame], axis=1) #creating new dataframe with Facebook vectors

	################################
	###### GENSEN EMBEDDINGS ######
	################################
	gensen_1 = GenSenSingle(model_folder='/Users/karanjani/Desktop/gensen/data/models',
	                        filename_prefix='nli_large_bothskip',
	                        pretrained_emb='/Users/karanjani/Desktop/gensen/data/embedding/glove.840B.300d.h5')
	gensen_2 = GenSenSingle(model_folder='/Users/karanjani/Desktop/gensen/data/models',
	                        filename_prefix='nli_large_bothskip_parse',
	                        pretrained_emb='/Users/karanjani/Desktop/gensen/data/embedding/glove.840B.300d.h5')
	# reps_h, reps_h_t = gensen_1.get_representation(messageString, pool='last', return_numpy=True, tokenize=True)
	gensen = GenSen(gensen_1, gensen_2)
	reps_h, reps_h_t = gensen.get_representation(cleanedStrings, pool='last', return_numpy=True, tokenize=True)

	gsvecFrame = pd.DataFrame(reps_h_t)
	GScols = ["GS%d" % d for d in range(4096)]
	gsvecFrame.columns = GScols
	# fullFrame = pd.concat([fullFrame, gsvecFrame], axis=1)
	fullFrame = pd.concat([df, gsvecFrame], axis=1)

	################################
	###### GOOGLE EMBEDDINGS ######
	################################
Example #6
class EncodingIteratorBase(DataIterator):
    """ Base generator class of sentence encodings."""
    def __init__(self,
                 max_sent_length,
                 max_sent_src,
                 max_sent_trg,
                 data_folder,
                 model_folder,
                 pretrain_path,
                 prefix,
                 source_file,
                 target_file,
                 use_gensen_w2i,
                 device_ids=[0],
                 data_parallelize=False,
                 test=False):
        """
        :param max_sent_length: max words in sentence
               gensen_h --> batch size x max_len x rep_size
        :param max_sent_src: number of sentences in source doc
        :param max_sent_trg: number of sentences in target doc
        :param data_folder: data location
        :param model_folder: location of pretrained gensen
        :param pretrain_path: location of pretrained embeddings (e.g. Glove)
        :param prefix: filename prefix for the type of gensen ["nli_large"+"bothskip"+"arxiv"]
        :param source_file: name of source file in data_folder
        :param target_file: name of target file in data_folder
        :param use_gensen_w2i: use the word to ids for pretrained gensen
        :param device_ids: used when data_parallelize = True, specify devices to use
        :param data_parallelize: if True, run the gensen encoder in parallel across device_ids
        :param test: if True, load only a small subset of documents for testing
        """
        self.max_len = max_sent_length  # max words
        self.max_sent_src = max_sent_src  # max sentences src
        self.max_sent_trg = max_sent_trg  # max sentences trg
        self.data_folder = data_folder
        self.source_file = source_file
        self.target_file = target_file
        self.src_data = []
        self.atrg_data = []
        self.data_parallelize = data_parallelize
        self.device_ids = device_ids
        self.test = test

        logging.debug(""" max_len: {}, max_sent_src: {}, max_sent_trg: {},
                data_folder: {}, source_file: {}, target_file: {}
            """.format(self.max_len, self.max_sent_src, self.max_sent_trg,
                       self.data_folder, self.source_file, self.target_file))
        self.gensen = GenSenSingle(model_folder=model_folder,
                                   filename_prefix=prefix,
                                   pretrained_emb=pretrain_path,
                                   cuda=True,
                                   max_sentence_length=max_sent_length,
                                   data_parallelize=data_parallelize,
                                   device_ids=device_ids[::-1])
        self.sen_rep_dim = self.gensen.sen_rep_dim
        self.vocab_size = self.gensen.vocab_size
        self.emb_dim = self.gensen.embedding_dim
        self.vocab_expansion(use_gensen_w2i)

    def vocab_expansion(self, use_gensen_w2i):
        """ Read data from files."""
        if self.test: logging.debug(" Testing with 100 documents")
        files = [self.source_file, self.target_file]
        data = [self.src_data, self.atrg_data]
        maxes_sen = [self.max_sent_src, self.max_sent_trg]

        for file, dt, max_sen in zip(files, data, maxes_sen):
            with open('%s/%s' % (self.data_folder, file),
                      'r',
                      encoding="utf-8") as source:
                doc = []
                for sentence in source:
                    if doc and sentence.startswith("\n"):
                        if len(doc) > max_sen: doc = doc[0:max_sen]
                        dt.append(doc)
                        doc = []
                    elif sentence.strip():
                        doc.append(sentence.strip())
                    if self.test and len(dt) > test_num_docs: break
        self.num_docs = len(self.src_data)
        assert self.num_docs == len(self.atrg_data)
        logging.info(" Constructing vocabulary...")

        if use_gensen_w2i:  # if True does not construct a new vocab
            self.word2id = self.gensen.word2id
            self.id2word = self.gensen.id2word
        else:
            self.word2id, self.id2word = self.construct_vocab(
                list(chain.from_iterable(self.src_data)) +
                list(chain.from_iterable(self.atrg_data)), self.vocab_size)
        self.gensen.vocab_expansion(self.word2id.keys())
        self.vocab_size = self.gensen.vocab_size
        logging.info(" Data has been read")
Example #7
    'I am pleased to inform all of those that believe in a strong, fair and sound Immigration Policy that Mark Morgan will be joining the Trump Administration as the head of our hard working men and women of ICE. Mark is a true believer and American Patriot. He will do a great job!',
    'For too long, a small group in our nations Capital has reaped the rewards of government while the people have borne the cost. Washington flourished -- but the people did not share in its wealth. Politicians prospered -- but the jobs left, and the factories closed.'
]
obama = [
    'Condolences to the family of John Singleton. His seminal work, Boyz n the Hood, remains one of the most searing, loving portrayals of the challenges facing inner-city youth. He opened doors for filmmakers of color to tell powerful stories that have been too often ignored.',
    'This generation of climate activists is tired of inaction, and theyve caught the attention of leaders all over the world. So while this challenge is only getting more urgent, they show us the kind of action itll take to meet this moment.',
    'That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet.'
]
idx2speaker = [
    'trump1', 'trump2', 'trumpinaguration', 'obama1', 'obama2',
    'obamainaguration', 'shakespeare1', 'shakespeare2', 'wutang1', 'wutang2',
    'lukecombs', 'lukecombs'
]
sentences = trump + obama + shakespeare + lukecombs + wutang
gensen_1 = GenSenSingle(model_folder='./data/models',
                        filename_prefix='nli_large_bothskip',
                        pretrained_emb='./data/embedding/glove.840B.300d.h5')
reps_h, reps_h_t = gensen_1.get_representation(sentences,
                                               pool='last',
                                               return_numpy=True,
                                               tokenize=True)
x = []
for i in range(len(reps_h)):
    x.append(reps_h[i].mean(axis=0))

model = TSNE(n_components=2,
             perplexity=20,
             init='pca',
             method='exact',
             n_iter=5000)
x = model.fit_transform(x)
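A plotting sketch for the 2-D points above, labeling each sentence with its idx2speaker entry (matplotlib assumed, one label per sentence):

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(x[:, 0], x[:, 1])
for i, speaker in enumerate(idx2speaker):
    plt.annotate(speaker, (x[i, 0], x[i, 1]))
plt.title('t-SNE of mean-pooled GenSen representations')
plt.show()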
Example #8
        s2vsingle[i].set_w2v_path(PATH_TO_W2V)
        s2vsingle[i] = s2vsingle[i].cuda()

    sent2vec = Sent2Vec(s2vsingle, 'concat')

    params_model = {'bsize': 64, 'pool_type': 'mean',
                    'which_layer': 'all',
                    'optfile': ELMO_OPTIONS,
                    'wgtfile': ELMO_WEIGHT}

    elmo = ELMo(params_model)
    elmo = elmo.cuda()

    gensen_1 = GenSenSingle(
        model_folder=FOLDER_PATH,
        filename_prefix=PREFIX1,
        pretrained_emb=PRETRAIN_EMB,
        cuda=True
    )
    gensen_2 = GenSenSingle(
        model_folder=FOLDER_PATH,
        filename_prefix=PREFIX2,
        pretrained_emb=PRETRAIN_EMB,
        cuda=True
    )
    gensen = GenSen(gensen_1, gensen_2)

    models = {
        'sent2vec': sent2vec,
        'elmo': elmo,
        'gensen': gensen
    }
Example #9
    def __init__(
        self,
        train_vocab,
        n_movies,
        params,
    ):
        super(Recommender, self).__init__()
        self.params = params
        self.train_vocab = train_vocab
        self.n_movies = n_movies
        self.cuda_available = torch.cuda.is_available()

        # instantiate the gensen module that will be used in the encoder HRNN, and by the recommender module
        self.gensen = GenSenSingle(
            model_folder=os.path.join(config.MODELS_PATH, 'GenSen'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(config.MODELS_PATH,
                                        'embeddings/glove.840B.300d.h5'),
            cuda=self.cuda_available)
        self.gensen.vocab_expansion(list(train_vocab))

        # HRNN encoder
        # Conversation encoder not bidirectional
        self.encoder = HRNN(params=params['hrnn_params'],
                            train_vocabulary=train_vocab,
                            gensen=self.gensen,
                            train_gensen=False,
                            conv_bidirectional=False)
        self.recommender_module = RecommendFromDialogue(
            params=params['recommend_from_dialogue_params'],
            train_vocab=train_vocab,
            n_movies=n_movies,
            gensen=self.gensen,
        )

        if params['language_aware_recommender']:
            self.language_to_user = nn.Linear(
                in_features=params['hrnn_params']
                ['conversation_encoder_hidden_size'],
                out_features=self.recommender_module.autorec.
                user_representation_size)
        # latent variable distribution parameters:
        latent_layer_sizes = params['latent_layer_sizes']
        if latent_layer_sizes is not None:
            latent_variable_size = latent_layer_sizes[-1]
            self.prior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=params['hrnn_params']
                          ['conversation_encoder_hidden_size'],
                          out_features=latent_layer_sizes[0])
                if i == 0 else nn.Linear(in_features=latent_layer_sizes[i - 1],
                                         out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = params['hrnn_params']['conversation_encoder_hidden_size'] \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_prior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_prior = nn.Linear(penultimate_size,
                                         latent_variable_size)

            # context size + size of sentence representations
            posterior_input_size = params['hrnn_params']['conversation_encoder_hidden_size'] +\
                                   2 * params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            self.posterior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=posterior_input_size,
                          out_features=latent_layer_sizes[0])
                if i == 0 else nn.Linear(in_features=latent_layer_sizes[i - 1],
                                         out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = posterior_input_size if len(
                latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_posterior = nn.Linear(penultimate_size,
                                          latent_variable_size)
            self.sigma_posterior = nn.Linear(penultimate_size,
                                             latent_variable_size)

        context_size = params['hrnn_params'][
            'conversation_encoder_hidden_size']
        if latent_layer_sizes is not None:
            context_size += latent_layer_sizes[-1]
        self.decoder = SwitchingDecoder(context_size=context_size,
                                        vocab_size=len(train_vocab),
                                        **params['decoder_params'])

        if self.cuda_available:
            self.cuda()
        self.decoder.set_pretrained_embeddings(
            self.encoder.gensen.encoder.src_embedding.weight.data)
Example #10
        # NOTE: To decide the pooling strategy for a new model, note down the validation set scores below.
    )
    parser.add_argument("--cuda",
                        help="Use GPU to compute sentence representations",
                        default=torch.cuda.is_available())
    args = parser.parse_args()

    print('#############################')
    print('####### Parameters ##########')
    print('Prefix 1 : %s ' % (args.prefix_1))
    print('Prefix 2 : %s ' % (args.prefix_2))
    print('Pretrained Embeddings : %s ' % (args.pretrain))
    print('#############################')

    gensen_1 = GenSenSingle(model_folder=args.folder_path,
                            filename_prefix=args.prefix_1,
                            pretrained_emb=args.pretrain,
                            cuda=args.cuda)
    gensen_2 = GenSenSingle(model_folder=args.folder_path,
                            filename_prefix=args.prefix_2,
                            pretrained_emb=args.pretrain,
                            cuda=args.cuda)

    gensen = GenSen(gensen_1, gensen_2)
    params_senteval['gensen'] = gensen
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    results_transfer = se.eval(transfer_tasks)

    print('--------------------------------------------')
    print('MR                [Dev:%.1f/Test:%.1f]' %
          (results_transfer['MR']['devacc'], results_transfer['MR']['acc']))
    print('CR                [Dev:%.1f/Test:%.1f]' %
Example #11
def main(arguments):
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--gpu_id", help="gpu id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=0)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    parser.add_argument("--log_file", help="File to log to", type=str, default='')
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)

    # Model options
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=16)
    parser.add_argument("--model_dir", help="path to model folder")
    parser.add_argument("--prefix1", help="prefix to model 1", default='nli_large_bothskip_parse')
    parser.add_argument("--prefix2", help="prefix to model 2", default='nli_large_bothskip')
    parser.add_argument("--word_vec_file", help="path to pretrained vectors")
    parser.add_argument("--strategy", help="Approach to create sentence embedding last/max/best",
                        choices=["best", "max", "last"], default="best")

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)


    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use for the classifier", type=int,
                        default=16)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    log_file = os.path.join(args.out_dir, "results.log")
    fileHandler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(fileHandler)
    logging.info(args)
    torch.cuda.set_device(args.gpu_id)

    # Set up SentEval
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch, 'kfold': 10,
            'max_seq_len': args.max_seq_len, 'batch_size': args.batch_size, 'load_data': args.load_data,
            'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': args.cls_batch_size,
            'tenacity': 5, 'epoch_size': 4, 'cudaEfficient': True}

    # Load model
    # import GenSen package
    sys.path.insert(0, args.model_dir)
    from gensen import GenSen, GenSenSingle

    ckpt_dir = os.path.join(args.model_dir, "data", "models")
    gensen_1 = GenSenSingle(model_folder=ckpt_dir, filename_prefix=args.prefix1,
                            pretrained_emb=args.word_vec_file, cuda=bool(args.gpu_id >= 0))
    gensen_2 = GenSenSingle(model_folder=ckpt_dir, filename_prefix=args.prefix2,
                            pretrained_emb=args.word_vec_file, cuda=bool(args.gpu_id >= 0))
    gensen = GenSen(gensen_1, gensen_2)
    global STRATEGY
    STRATEGY = args.strategy
    params_senteval['gensen'] = gensen

    # Do SentEval stuff
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    tasks = get_tasks(args.tasks)
    results = se.eval(tasks)
    write_results(results, args.out_dir)
    logging.info(results)
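batcher and prepare are defined elsewhere in the script; a minimal sketch of what batcher could look like, given the GenSen API used throughout this page (the real script presumably also honors the STRATEGY global when pooling):

def batcher(params, batch):
    # SentEval passes each sentence as a list of tokens; rejoin before encoding
    sentences = [' '.join(tokens) for tokens in batch]
    _, reps_h_t = params['gensen'].get_representation(
        sentences, pool='last', return_numpy=True, tokenize=True)
    return reps_h_t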
Example #12
class Seq2SeqGAN:
    def __init__(self, dataset, device):
        self.dataset = dataset

        self.clip = 5

        self.latent_size = 2048
        self.decoder_hidden_size = 2048

        self.decoder_layers = 2

        self.noise = Normal(torch.tensor([0.0], requires_grad=False),
                            torch.tensor([0.12], requires_grad=False))

        self.encoder = GenSenSingle(
            model_folder='./data/models',
            filename_prefix='nli_large_bothskip',
            pretrained_emb='./data/embedding/glove.840B.300d.h5',
            cuda=torch.cuda.is_available())
        vocab_size = len(self.encoder.encoder.src_embedding.weight)
        self.encoder.encoder.to(device)
        self.decoder = Decoder(self.decoder_hidden_size,
                               self.latent_size,
                               vocab_size,
                               self.decoder_layers,
                               device=device,
                               clip=5).to(device)

        # zero out <pad> so padding tokens contribute nothing to the loss
        weight_mask = torch.ones(vocab_size).to(device)
        weight_mask[self.encoder.word2id['<pad>']] = 0
        self.criterion = nn.CrossEntropyLoss(weight=weight_mask).to(device)
        self.bce = nn.BCELoss()
        self.device = device

        self.embedding_norms = torch.norm(
            self.encoder.encoder.src_embedding.weight, 1)

        print(self.decoder)

    def print_step(self, input_tensor, lengths, decoder_outputs, losses, epoch,
                   i):

        # print out a medium sized tweet
        mid = int(BATCH_SIZE / 2)

        input_to_print = input_tensor[mid, :lengths[mid]].view(-1)
        output_to_print = decoder_outputs[mid, :lengths[mid], :]

        input_text = ' '.join(
            [self.encoder.id2word[int(i)] for i in input_to_print])
        output_text = self.unembed(output_to_print)

        print('{0:d} {1:d} l1: {2:.10f} l2: {3:.10f}'.format(
            epoch, i * BATCH_SIZE, losses[0], losses[1]))

        print(input_text)
        print(output_text)
        print(' ', flush=True)

    def get_loss(self, cropped_input, lengths, decoder_outputs, stops):

        l1 = self.criterion(
            decoder_outputs.contiguous().view(-1, decoder_outputs.size(2)),
            cropped_input.contiguous().view(-1))

        ideal_stops = torch.zeros_like(stops)
        for i, l in enumerate(lengths):
            if l <= ideal_stops.size(1):
                ideal_stops[i, l - 1] = 1
        l2 = self.bce(stops, ideal_stops)

        return l1, l2

    def train_step(self, input_tensor, lengths, optimizer_gen,
                   word_dropout_rate):
        optimizer_gen.zero_grad()

        encoder_outputs, encoder_hidden, embedded_input, lengths = self.encoder.get_representation_and_embedded_input(
            input_tensor, pool='last', return_numpy=False, tokenize=True)

        encoder_outputs = encoder_outputs.detach()
        encoder_hidden = encoder_hidden.detach()
        embedded_input = embedded_input.detach()
        lengths = lengths.to(device)
        lengths = lengths.detach()

        noise = self.noise.sample(
            encoder_hidden.size()).view_as(encoder_hidden).to(self.device)
        encoder_hidden += noise

        decoder_outputs, stops = self.decoder.forward(encoder_outputs,
                                                      encoder_hidden,
                                                      word_dropout_rate)

        # resize input to match decoder output (due to pre-empting decoder)
        cropped_input = embedded_input[:, :decoder_outputs.size(1)]

        l1, l2 = self.get_loss(cropped_input, lengths, decoder_outputs, stops)

        loss_gen = l1 + l2
        loss_gen.backward()
        optimizer_gen.step()

        losses = np.array([l1.item(), l2.item()])
        return losses, decoder_outputs.data, embedded_input.data, lengths.data

    def validation_step(self, input_tensor, lengths):
        encoder_outputs, encoder_hidden, embedded_input, lengths = self.encoder.get_representation_and_embedded_input(
            input_tensor, pool='last', return_numpy=False, tokenize=True)

        encoder_outputs = encoder_outputs.detach()
        encoder_hidden = encoder_hidden.detach()
        embedded_input = embedded_input.detach()
        lengths = lengths.to(device)
        lengths = lengths.detach()

        decoder_outputs, stops = self.decoder.forward(encoder_outputs,
                                                      encoder_hidden, 0)

        # resize input to match decoder output (due to pre-empting decoder)
        cropped_input = embedded_input[:, :decoder_outputs.size(1)]
        l1, l2 = self.get_loss(cropped_input, lengths, decoder_outputs, stops)

        losses = np.array([l1.item(), l2.item()])
        return losses, decoder_outputs.data, embedded_input.data, lengths.data

    def train(self,
              optimizer_gen,
              epochs=1,
              print_every=500,
              validate_every=50000,
              word_dropout_rate=0.0,
              best_validation_loss=np.inf,
              start_at=0):

        print('USING: {}'.format(self.device))

        validations_since_best = 0
        for epoch in range(epochs):

            print_total = np.array([0.0] * 2)

            for i in range(1500000):
                input_tensor, lengths = self.dataset[0]

                lengths = lengths.to(device)

                if len(input_tensor) != BATCH_SIZE:
                    break

                samples_processed = (epoch * BATCH_SIZE * 1500000) + (
                    (i + 1) * BATCH_SIZE) + start_at

                losses, decoder_outputs, embedded_input, lengths = self.train_step(
                    input_tensor, lengths, optimizer_gen, word_dropout_rate)

                print_total += losses

                if i > 0 and i % print_every == 0:
                    print_total /= print_every
                    self.print_step(embedded_input, lengths, decoder_outputs,
                                    print_total, epoch, i)

                    for y, l in zip(print_total, ['reconstruction', 'stops']):
                        vis.line(X=np.array([int(samples_processed)]),
                                 Y=np.array([[float(y)]]),
                                 win=l,
                                 opts=dict(title=l,
                                           xlabel='samples processed',
                                           ylabel='loss',
                                           legend=['train']),
                                 update='append')
                    print_total *= 0

                if i > 0 and i % validate_every == 0:
                    val = self.validate(print_every, samples_processed)

                    vis.line(X=np.array([int(samples_processed)]),
                             Y=np.array([[float(val / len(validation_loader))]
                                         ]),
                             win='validation',
                             opts=dict(title="validation",
                                       xlabel='samples processed',
                                       ylabel='loss',
                                       legend=['val']),
                             update='append')

                    if val < best_validation_loss:
                        best_validation_loss = val
                        validations_since_best = 0

                        save_checkpoint(self.decoder, optimizer_gen,
                                        samples_processed,
                                        best_validation_loss)

                    else:
                        validations_since_best += 1

                    print("{} SINCE LAST BEST VALIDATION".format(
                        validations_since_best))

                    if validations_since_best >= 100:
                        return

                del input_tensor
                del decoder_outputs
                del embedded_input

    def validate(self, print_every, samples_processed):
        print("VALIDATING")

        print_total = np.array([0.0] * 2)

        with torch.no_grad():
            for i in range(5000):
                input_tensor, lengths = self.dataset[0]
                # for (i, [input_tensor, lengths]) in enumerate(validation_loader):
                if len(input_tensor) != BATCH_SIZE:
                    break

                lengths = lengths.to(device)

                losses, decoder_outputs, embedded_input, lengths = self.validation_step(
                    input_tensor, lengths)

                print_total += losses

                if i > 0 and i % print_every == 0:

                    self.print_step(embedded_input, lengths, decoder_outputs,
                                    print_total / i, 0, i)

                del input_tensor
                del decoder_outputs
                del embedded_input

        print("AVERAGE VALIDATION LOSS: {}".format(
            float(print_total[0]) / len(validation_loader)))
        return float(print_total[0])

    # def unembed(self, decoder_outputs, length=MAX_LENGTH):
    #
    #     indices = [int(torch.argmax(
    #         torch.mm(self.encoder.encoder.src_embedding.weight,
    #                                      torch.unsqueeze(d, 1)[:EMBEDDING_SIZE])
    #         / self.embedding_norms
    #     )) for d in decoder_outputs]
    #     return ' '.join([self.encoder.id2word[i] for i in indices])

    def unembed(self, decoder_outputs, length=MAX_LENGTH):
        indices = [int(torch.argmax(d)) for d in decoder_outputs]
        # indices = [int(torch.argmin(
        #     torch.norm(self.encoder.encoder.src_embedding.weight - d[:EMBEDDING_SIZE], dim=1)
        # )) for d in decoder_outputs]
        return ' '.join([self.encoder.id2word[i] for i in indices])
Example #13
        if i % 10 == 0:
            print("%d sentences done" % (i))
            # print("Adv embeddings shape: %s, adv_labels shape", len(sent_adv_embeddings), dim(adv_labels[i]))

    print("Adv embeddings shape: %s, adv_labels shape %s" %
          (len(adv_embeddings), len(adv_labels)))

    for i in range(0, len(adv_embeddings), 10):
        print("Adv embeddings shape: %s, adv_labels shape",
              len(adv_embeddings[i]), len(adv_labels[i]))
    return adv_embeddings, adv_labels, adv_batch_sentences


# Load GenSen model
gensen_1 = GenSenSingle(model_folder='../data/models',
                        filename_prefix='nli_large_bothskip',
                        pretrained_emb='fasttext/glove.840B.300d.h5')
gensen_2 = GenSenSingle(model_folder='../data/models',
                        filename_prefix='nli_large_bothskip_parse',
                        pretrained_emb='fasttext/glove.840B.300d.h5')
gensen_encoder = GenSen(gensen_1, gensen_2)

# reps_h, reps_h_t = gensen_encoder.get_representation(
#     sentences, pool='last', return_numpy=True, tokenize=True
# )

# Set params for SentEval
params_senteval = {
    'task_path': PATH_TO_DATA,
    'usepytorch': True,
    'kfold': 5,
Example #14
class SemanticAnalyser(object):
    """Class for comparing sentences for entailment
    """
    def __init__(self):
        """Initalizes object
        """
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'embedding', 'glove.840B.300d.h5')
        )

        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)

        self.__mutex = Lock()

    def get_entailments_with_levels(self, sentence, sentences):
        """Analyzes relation between a sentence and all in a collection

        Args:
            sentence: a sentence
            sentences: a non-empty list of sentences

        Returns:
            entailment:
                0 if entailed, 1 if neutral, 2 if contradicting for each element in sentences
            level:
                a non-negative score of how strongly the sentence is entailed with each element in sentences
        """
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation([sentence] + sentences, pool='last', return_numpy=True, tokenize=True)
        input = np.concatenate((
            np.repeat([encoded[0]], len(sentences), axis=0),
            encoded[1:],
            (np.repeat([encoded[0]], len(sentences), axis=0)) * encoded[1:]), axis=1)
        output = self.__model_predict(input)
        self.__mutex.release()

        entailment = np.argmax(output, axis=1)

        level = np.max(output, axis=1) - np.transpose(output)[1]

        for i, sent in enumerate(sentences):
            if sentence == sent:
                entailment[i] = 0
                level[i] = 1e10

        return entailment, level

    def get_entailment(self, sentence1, sentence2):
        """Analyzes relation between two sentences

        Args:
            sentence1: first sentence as a string
            sentence2: second sentence as a string

        Returns:
            0 if entailed, 1 if neutral, 2 if contradicting
        """

        if sentence1 == sentence2:
            return 0

        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation([sentence1, sentence2], pool='last', return_numpy=True, tokenize=True)
        input = np.concatenate((encoded[0], encoded[1], encoded[0] * encoded[1]))
        output = self.__model_predict(np.array([input]))
        self.__mutex.release()

        return np.argmax(output)

    def __model_predict(self, input):
        sentence_size = input.shape[1] // 3
        batch_size = input.shape[0]
        switched_input = np.hstack((input[:, sentence_size:2*sentence_size], input[:, 0:sentence_size], input[:, 2*sentence_size:3*sentence_size]))

        input = np.vstack((input, switched_input))
        
        self.__evaluator.model.eval()
        input = torch.FloatTensor(input).cuda()
        yhat = []
        with torch.no_grad():
            for i in range(0, len(input), self.__evaluator.batch_size):
                x = input[i:i + self.__evaluator.batch_size]
                output = self.__evaluator.model(x)
                yhat.append(output.data.cpu().numpy())
        yhat = np.vstack(yhat)
        yhat = (yhat[0:batch_size, :] + yhat[batch_size:2*batch_size, :]) / 2
        return yhat
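A usage sketch for the class above, assuming the GenSen model files and senteval.pickle are in place:

analyser = SemanticAnalyser()
label = analyser.get_entailment('A man is sleeping.', 'A man is awake.')
print(label)  # 0 = entailed, 1 = neutral, 2 = contradicting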
Example #15
    reps_h, reps_h_t = gensen_1.get_representation(list_mystr,
                                                   pool='last',
                                                   return_numpy=True,
                                                   tokenize=True)
    vectors = reps_h_t.tolist()
    return vectors


@app.route('/get_embeddings/', methods=['POST'])
def home():
    sentences_list = list(request.json['sentences_list'])
    sentences_list = [
        x.lower().encode("unicode_escape").decode("utf8")
        for x in sentences_list
    ]

    if (not sentences_list):
        return "Arg \"sentences_list\", not found"
    vec = embeddings(sentences_list)
    # print(type(vec), len(vec))
    return jsonify(vectors=vec)


gensen_1 = GenSenSingle(
    model_folder='gensen/data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='gensen/data/embedding/glove.840B.300d.h5')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7654)
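A hypothetical client for the /get_embeddings/ endpoint above, once the server is running:

import requests

response = requests.post('http://localhost:7654/get_embeddings/',
                         json={'sentences_list': ['hello world',
                                                  'gensen is a sentence encoder']})
vectors = response.json()['vectors']
print(len(vectors), len(vectors[0]))  # one 2048-dim vector per sentence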
Example #16
class Recommender(nn.Module):
    def __init__(
        self,
        train_vocab,
        n_movies,
        params,
    ):
        super(Recommender, self).__init__()
        self.params = params
        self.train_vocab = train_vocab
        self.n_movies = n_movies
        self.cuda_available = torch.cuda.is_available()

        # instantiate the gensen module that will be used in the encoder HRNN, and by the recommender module
        self.gensen = GenSenSingle(
            model_folder=os.path.join(config.MODELS_PATH, 'GenSen'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(config.MODELS_PATH,
                                        'embeddings/glove.840B.300d.h5'),
            cuda=self.cuda_available)
        self.gensen.vocab_expansion(list(train_vocab))

        # HRNN encoder
        # Conversation encoder not bidirectional
        self.encoder = HRNN(params=params['hrnn_params'],
                            train_vocabulary=train_vocab,
                            gensen=self.gensen,
                            train_gensen=False,
                            conv_bidirectional=False)
        self.recommender_module = RecommendFromDialogue(
            params=params['recommend_from_dialogue_params'],
            train_vocab=train_vocab,
            n_movies=n_movies,
            gensen=self.gensen,
        )

        if params['language_aware_recommender']:
            self.language_to_user = nn.Linear(
                in_features=params['hrnn_params']
                ['conversation_encoder_hidden_size'],
                out_features=self.recommender_module.autorec.
                user_representation_size)
        # latent variable distribution parameters:
        latent_layer_sizes = params['latent_layer_sizes']
        if latent_layer_sizes is not None:
            latent_variable_size = latent_layer_sizes[-1]
            self.prior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=params['hrnn_params']
                          ['conversation_encoder_hidden_size'],
                          out_features=latent_layer_sizes[0])
                if i == 0 else nn.Linear(in_features=latent_layer_sizes[i - 1],
                                         out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = params['hrnn_params']['conversation_encoder_hidden_size'] \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_prior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_prior = nn.Linear(penultimate_size,
                                         latent_variable_size)

            # context size + size of sentence representations
            posterior_input_size = params['hrnn_params']['conversation_encoder_hidden_size'] +\
                                   2 * params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            self.posterior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=posterior_input_size,
                          out_features=latent_layer_sizes[0])
                if i == 0 else nn.Linear(in_features=latent_layer_sizes[i - 1],
                                         out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = posterior_input_size if len(
                latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_posterior = nn.Linear(penultimate_size,
                                          latent_variable_size)
            self.sigma_posterior = nn.Linear(penultimate_size,
                                             latent_variable_size)

        context_size = params['hrnn_params'][
            'conversation_encoder_hidden_size']
        if latent_layer_sizes is not None:
            context_size += latent_layer_sizes[-1]
        self.decoder = SwitchingDecoder(context_size=context_size,
                                        vocab_size=len(train_vocab),
                                        **params['decoder_params'])

        if self.cuda_available:
            self.cuda()
        self.decoder.set_pretrained_embeddings(
            self.encoder.gensen.encoder.src_embedding.weight.data)

    def reparametrize(self, mu, logvariance):
        """
        Sample the latent variable
        :param mu:
        :param logvar:
        :return:
        """
        std = torch.exp(0.5 * logvariance)
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        eps = Variable(torch.randn(std.data.shape, out=tt()))
        return mu + eps * std

    def forward(self, input_dict, return_latent=False):
        # encoder result: (batch_size, max_conv_length, conversation_encoder_hidden_size)
        conversation_representations, sentence_representations = self.encoder(
            input_dict, return_all=True, return_sentence_representations=True)
        batch_size, max_conversation_length, max_utterance_length = input_dict[
            "dialogue"].data.shape

        # get movie_recommendations (batch, max_conv_length, n_movies)
        if self.params['language_aware_recommender']:
            user_rep_from_language = self.language_to_user(
                conversation_representations)
        movie_recommendations = self.recommender_module(
            dialogue=input_dict["dialogue"],
            senders=input_dict["senders"],
            lengths=input_dict["lengths"],
            conversation_lengths=input_dict["conversation_lengths"],
            movie_occurrences=input_dict["movie_occurrences"],
            recommend_new_movies=False,
            user_representation=user_rep_from_language
            if self.params['language_aware_recommender'] else None)

        # TODO: only decode recommender's utterances
        # Decoder:
        utterances = input_dict["dialogue"].view(
            batch_size * max_conversation_length, -1)
        lengths = input_dict["lengths"]
        # order by descending utterance length
        lengths = lengths.reshape((-1))
        sorted_lengths, sorted_idx, rev = sort_for_packed_sequence(
            lengths, cuda=self.cuda_available)

        sorted_utterances = utterances.index_select(0, sorted_idx)

        # shift the context vectors one step in time
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        pad_tensor = (Variable(
            torch.zeros(
                batch_size,
                1,
                self.params['hrnn_params']['conversation_encoder_hidden_size'],
                out=tt())))
        conversation_representations = torch.cat(
            (pad_tensor, conversation_representations),
            1).narrow(1, 0, max_conversation_length)
        # and reshape+reorder the same way as utterances
        conversation_representations = conversation_representations.contiguous().view(
            batch_size * max_conversation_length, self.params['hrnn_params']['conversation_encoder_hidden_size'])\
            .index_select(0, sorted_idx)

        # shift the movie recommendations one step in time
        pad_tensor = (Variable(
            torch.zeros(batch_size, 1, self.n_movies, out=tt())))
        movie_recommendations = torch.cat((pad_tensor, movie_recommendations),
                                          1).narrow(1, 0,
                                                    max_conversation_length)
        # and reshape+reorder movie_recommendations the same way as utterances
        movie_recommendations = movie_recommendations.contiguous().view(
            batch_size * max_conversation_length,
            -1).index_select(0, sorted_idx)

        # consider only lengths > 0
        num_positive_lengths = np.sum(lengths > 0)
        sorted_utterances = sorted_utterances[:num_positive_lengths]
        sorted_lengths = sorted_lengths[:num_positive_lengths]
        conversation_representations = conversation_representations[:
                                                                    num_positive_lengths]
        movie_recommendations = movie_recommendations[:num_positive_lengths]

        # Latent variable
        if self.params['latent_layer_sizes'] is not None:
            # remember that conversation_representations have been shifted one step in time
            h_prior = conversation_representations
            for layer in self.prior_hidden_layers:
                h_prior = F.relu(layer(h_prior))
            mu_prior = self.mu_prior(h_prior)
            logvar_prior = self.sigma_prior(h_prior)
            # posterior conditioned on current context, and representation of the next utterance (that is the
            # utterance about to be decoded)
            # reshape sentence representations the same way as utterances
            sentence_representations = sentence_representations.view(
                batch_size * max_conversation_length,
                2 * self.params['hrnn_params']['sentence_encoder_hidden_size']
                + 1).index_select(0, sorted_idx)
            sentence_representations = sentence_representations[:
                                                                num_positive_lengths]
            h_posterior = torch.cat(
                (conversation_representations, sentence_representations), 1)
            for layer in self.posterior_hidden_layers:
                h_posterior = F.relu(layer(h_posterior))
            mu_posterior = self.mu_posterior(h_posterior)
            logvar_posterior = self.sigma_posterior(h_posterior)

            # In training, sample from the posterior distribution. At test time, sample from prior.
            mu, logvar = (mu_posterior,
                          logvar_posterior) if self.training else (
                              mu_prior, logvar_prior)
            z = self.reparametrize(mu, logvar)

            context = torch.cat((conversation_representations, z), 1)
        else:
            context = conversation_representations

        # Run decoder
        outputs = self.decoder(sorted_utterances,
                               sorted_lengths,
                               context,
                               movie_recommendations,
                               log_probabilities=True,
                               sample_movies=False)

        # Complete the missing sequences (of length 0)
        if num_positive_lengths < batch_size * max_conversation_length:
            tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
            pad_tensor = Variable(
                torch.zeros(batch_size * max_conversation_length -
                            num_positive_lengths,
                            max_utterance_length,
                            len(self.train_vocab) + self.n_movies,
                            out=tt()))
            outputs = torch.cat((outputs, pad_tensor), 0)

        # print("OUTPUT SHAPE :", outputs.data.shape) # (batch * max_conv_len, max_sentence_len, vocab + n_movie)
        # retrieve original order
        outputs = outputs.index_select(0, rev). \
            view(batch_size, max_conversation_length, max_utterance_length, -1)
        # print("OUTPUT SHAPE RETRIEVED IN ORDER", outputs.data.shape)
        # (batch, max_conv_len, max_sentence_len, vocab + n_movie)
        if return_latent:
            if self.params['latent_layer_sizes'] is None:
                raise ValueError(
                    "Model has no latent variable, cannot return latent parameters."
                )
            return outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior
        return outputs

    def train_iter(self, batch, criterion, kl_coefficient=1):
        self.train()
        if self.params['latent_layer_sizes'] is not None:
            outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior = self.forward(
                batch, return_latent=True)
        else:
            outputs = self.forward(batch, return_latent=False)

        batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape
        # indices of recommender's utterances(< batch * max_conv_len)
        idx = Variable(
            torch.nonzero((batch["senders"].view(-1) == -1).data).squeeze())
        # select recommender's utterances for the loss
        outputs = outputs.view(-1, max_seq_length,
                               vocab_size).index_select(0, idx)
        target = batch["target"].view(-1, max_seq_length).index_select(0, idx)

        loss = criterion(outputs.view(-1, vocab_size), target.view(-1))

        # variational loss = KL(posterior || prior)
        if self.params['latent_layer_sizes'] is not None:
            # for two normal distributions, kld(p1, p2) =
            # log(sig2 / sig1) + (sig1^2 + (mu1-mu2)^2) / (2 sig2^2) - 1/2
            # multivariate: (sig1 and sig2 the covariance matrices)
            # .5 * (tr(sig2^-1 sig1) + (mu2-mu1)T sig2^-1 (mu2-mu1) - k + ln(det(sig2) / det(sig1))
            # in the case where sig1 and sig2 are diagonal:
            # .5 * sum(sig1^2 / sig2^2 + (mu2-mu1)^2 / sig2^2 - 1 + ln(sig2^2) - ln(sig1^2))
            kld = .5 * (
                -1 + logvar_prior - logvar_posterior +
                (torch.exp(logvar_posterior) +
                 (mu_posterior - mu_prior).pow(2)) / torch.exp(logvar_prior))
            kld = torch.mean(torch.sum(kld, -1))
            # print("NLL loss {} KLD {}".format(loss.data, kld.data))
            loss += kl_coefficient * kld
        # backward pass
        loss.backward()
        return loss.data[0]

    def evaluate(self, batch_loader, criterion, subset="valid"):
        """
        Evaluate function
        :param subset: in {"valid", "train"}. Susbet on which to evaluate
        :return: the mean loss.
        """
        self.eval()
        batch_loader.batch_index[subset] = 0
        n_batches = batch_loader.n_batches[subset]

        losses = []
        for _ in tqdm(range(n_batches)):
            # load batch
            batch = batch_loader.load_batch(subset=subset)
            if self.cuda_available:
                batch["dialogue"] = batch["dialogue"].cuda()
                batch["target"] = batch["target"].cuda()
                batch["senders"] = batch["senders"].cuda()
            # compute output and loss
            outputs = self.forward(batch)

            batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape
            # indices of recommender's utterances(< batch * max_conv_len)
            idx = Variable(
                torch.nonzero(
                    (batch["senders"].view(-1) == -1).data).squeeze())
            # select recommender's utterances for the loss
            outputs = outputs.view(-1, max_seq_length,
                                   vocab_size).index_select(0, idx)
            target = batch["target"].view(-1,
                                          max_seq_length).index_select(0, idx)

            loss = criterion(outputs.view(-1, vocab_size), target.view(-1))
            losses.append(loss.data[0])
        print("{} loss : {}".format(subset, np.mean(losses)))
        self.train()
        return np.mean(losses)
Example #17
"""

import sys
import json
import h5py
import numpy as np

DATA_PATH = '/hdd/robik/CLEVR'
GENSEN_PATH = '/hdd/robik/projects/gensen'
sys.path.append(f'{GENSEN_PATH}')

from gensen import GenSen, GenSenSingle

gensen_1 = GenSenSingle(
    model_folder=f'{GENSEN_PATH}/data/models',
    filename_prefix='nli_large_bothskip',
    cuda=True,
    pretrained_emb=f'{GENSEN_PATH}/data/embedding/glove.840B.300d.h5')

for split in ['train', 'val']:
    feat_h5 = h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'w')
    ques = json.load(
        open(f'{DATA_PATH}/questions/CLEVR_{split}_questions.json'))
    ques = ques['questions']
    questions = [q['question'] for q in ques]
    qids = [q['question_index'] for q in ques]
    qids = np.int64(qids)
    dt = h5py.special_dtype(vlen=str)
    feat_h5.create_dataset('feats', (len(qids), 2048), dtype=np.float32)
    feat_h5.create_dataset('qids', (len(qids), ), dtype=np.int64)
    feat_h5.create_dataset('questions', (len(qids), ), dtype=dt)
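The loop body above is cut off before the datasets are filled; a sketch of the missing encoding step inside the split loop, batched the same way as Example #3, might be:

    feat_h5['qids'][:] = qids
    feat_h5['questions'][:] = questions
    for k in range(0, len(questions), 32):
        batch = questions[k:k + 32]
        _, reps_h_t = gensen_1.get_representation(batch,
                                                  pool='last',
                                                  return_numpy=True,
                                                  tokenize=True)
        feat_h5['feats'][k:k + len(batch), :] = reps_h_t
    feat_h5.close()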
Example #18
        help="Path to pretrained embeddings",
        default=
        '/data/milatmp1/subramas/embeddings/new_glove.840B.300d.h5',  # (Don't mess with this)
    )
    args = parser.parse_args()

    batch_size = 20000
    hidden_size = 2048
    max_length = 100
    data_file = args.train_filename

    iterator = SentenceIterator(data_file,
                                vocab_size=80000,
                                max_length=max_length)
    model = GenSenSingle(model_folder=args.folder_path,
                         filename_prefix=args.prefix,
                         pretrained_emb=args.pretrain,
                         cuda=True)
    iterator.word2id = model.word2id
    iterator.id2word = model.id2word
    model.vocab_expansion(model.id2word.values())
    sentences = iterator.lines if batch_size == 'all' else iterator.lines[
        0:batch_size]
    sentences = [' '.join(s[:max_length]) for s in sentences]
    repr_last_h = np.empty((0, hidden_size))
    for mbatch_idx, mbatch in enumerate(range(0, len(sentences), 200)):
        less_sentences = sentences[mbatch:mbatch + 200]
        _, last_h = model.get_representation(less_sentences,
                                             pool='last',
                                             return_numpy=True,
                                             tokenize=False)
        repr_last_h = np.append(repr_last_h, last_h, axis=0)
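np.append copies the accumulated array on every iteration; collecting the batches in a list and concatenating once at the end is equivalent and avoids the quadratic copying (a sketch):

    batch_reprs = []
    for mbatch in range(0, len(sentences), 200):
        _, last_h = model.get_representation(sentences[mbatch:mbatch + 200],
                                             pool='last',
                                             return_numpy=True,
                                             tokenize=False)
        batch_reprs.append(last_h)
    repr_last_h = np.concatenate(batch_reprs, axis=0)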