Example #1
import json

import h5py
import numpy as np

# NUM_EMBEDDINGS (the number of rows in the embedding matrix, presumably one
# per vocabulary entry plus an index-0 slot) is a module-level constant in the
# original source.

def get_gensen_synset_definitions(entity_file, vocab_file, gensen_file):
    from gensen import GenSen, GenSenSingle

    gensen_1 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5')
    gensen_1.eval()

    # Map synset id -> textual definition (the entity file has one JSON object per line).
    definitions = {}
    with open(entity_file, 'r') as fin:
        for line in fin:
            node = json.loads(line)
            if node['type'] == 'synset':
                definitions[node['id']] = node['definition']

    with open(vocab_file, 'r') as fin:
        vocab_list = fin.read().strip().split('\n')

    # get the descriptions
    sentences = [''] * NUM_EMBEDDINGS
    for k, entity in enumerate(vocab_list):
        definition = definitions.get(entity)
        if definition is None:
            assert entity in ('@@UNKNOWN@@', '@@MASK@@', '@@NULL@@')
        else:
            sentences[k + 1] = definition

    # Encode the definitions with GenSen in minibatches of 32 sentences.
    embeddings = np.zeros((NUM_EMBEDDINGS, 2048), dtype=np.float32)
    for k in range(0, NUM_EMBEDDINGS, 32):
        sents = sentences[k:(k + 32)]
        reps_h, reps_h_t = gensen_1.get_representation(sents,
                                                       pool='last',
                                                       return_numpy=True,
                                                       tokenize=True)
        embeddings[k:(k + 32), :] = reps_h_t
        print(k)

    with h5py.File(gensen_file, 'w') as fout:
        ds = fout.create_dataset('gensen', data=embeddings)
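A minimal sketch of how the function above might be called; the three paths are placeholders rather than files from the original project.

get_gensen_synset_definitions(
    entity_file='entities.jsonl',       # one JSON object per line, with 'type', 'id', 'definition'
    vocab_file='vocab.txt',             # newline-separated entity vocabulary
    gensen_file='gensen_embeddings.h5') # output HDF5 file with a 'gensen' dataset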
Example #2
    feat_h5 = h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'w')
    with open(f'{DATA_PATH}/questions/CLEVR_{split}_questions.json') as fin:
        ques = json.load(fin)
    ques = ques['questions']
    questions = [q['question'] for q in ques]
    qids = [q['question_index'] for q in ques]
    qids = np.int64(qids)
    dt = h5py.special_dtype(vlen=str)
    feat_h5.create_dataset('feats', (len(qids), 2048), dtype=np.float32)
    feat_h5.create_dataset('qids', (len(qids), ), dtype=np.int64)
    feat_h5.create_dataset('questions', (len(qids), ), dtype=dt)
    feat_h5['qids'][:] = qids
    feat_h5['questions'][:] = questions

    # Split the questions into chunks of 5,000 for batched encoding.
    chunksize = 5000
    question_chunks = [
        questions[x:x + chunksize] for x in range(0, len(questions), chunksize)
    ]

    done = 0
    for qchunk in question_chunks:
        print(done)
        _, reps_h_t = gensen_1.get_representation(qchunk,
                                                  pool='last',
                                                  return_numpy=True,
                                                  tokenize=True)
        feat_h5['feats'][done:done + len(qchunk)] = reps_h_t
        done += len(qchunk)

    feat_h5.close()
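A short sketch (an assumption, not part of the original) of reading the features back from the HDF5 file written above:

with h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'r') as fin:
    feats = fin['feats'][:]          # (num_questions, 2048) float32 GenSen vectors
    qids = fin['qids'][:]            # int64 CLEVR question indices
    questions = fin['questions'][:]  # variable-length strings (bytes in recent h5py)
print(feats.shape, qids[0], questions[0])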
Example #3
from gensen import GenSenSingle
from matplotlib import pyplot
from sklearn.manifold import TSNE
import numpy as np

# The trump, shakespeare, wutang and lukecombs sentence lists are defined the
# same way in the original script but are omitted from this excerpt.
obama = [
    'Condolences to the family of John Singleton. His seminal work, Boyz n the Hood, remains one of the most searing, loving portrayals of the challenges facing inner-city youth. He opened doors for filmmakers of color to tell powerful stories that have been too often ignored.',
    "This generation of climate activists is tired of inaction, and they've caught the attention of leaders all over the world. So while this challenge is only getting more urgent, they show us the kind of action it'll take to meet this moment.",
    'That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet.'
]
idx2speaker = [
    'trump1', 'trump2', 'trumpinaguration', 'obama1', 'obama2',
    'obamainaguration', 'shakespeare1', 'shakespeare2', 'wutang1', 'wutang2',
    'lukecombs', 'lukecombs'
]
sentences = trump + obama + shakespeare + lukecombs + wutang
gensen_1 = GenSenSingle(model_folder='./data/models',
                        filename_prefix='nli_large_bothskip',
                        pretrained_emb='./data/embedding/glove.840B.300d.h5')
reps_h, reps_h_t = gensen_1.get_representation(sentences,
                                               pool='last',
                                               return_numpy=True,
                                               tokenize=True)
x = []
for i in range(len(reps_h)):
    x.append(reps_h[i].mean(axis=0))

model = TSNE(n_components=2,
             perplexity=20,
             init='pca',
             method='exact',
             n_iter=5000)
x = model.fit_transform(x)
pyplot.figure(figsize=(20, 20))
pyplot.xlim((np.min(x[:, 0]) - 10, np.max(x[:, 0]) + 10))
pyplot.ylim((np.min(x[:, 1]) - 10, np.max(x[:, 1]) + 10))
for i in range(len(x)):
    # The original loop body is truncated in this excerpt; annotating each point
    # with its speaker label is an assumption about what it did.
    pyplot.annotate(idx2speaker[i], xy=(x[i, 0], x[i, 1]))
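As a follow-up (an assumption, not in the original script), the same last-pooled vectors reps_h_t can be compared directly, for example with pairwise cosine similarity:

norm = reps_h_t / np.linalg.norm(reps_h_t, axis=1, keepdims=True)
sim = norm @ norm.T
for i, speaker in enumerate(idx2speaker):
    j = int(np.argsort(sim[i])[-2])  # most similar sentence other than itself
    print(speaker, '->', idx2speaker[j], round(float(sim[i, j]), 3))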
Example #4
import os
import pickle
from threading import Lock

import numpy as np
import torch
from gensen import GenSenSingle


class SemanticAnalyser(object):
    """Class for comparing sentences for entailment
    """
    def __init__(self):
        """Initalizes object
        """
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'embedding', 'glove.840B.300d.h5')
        )

        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)

        self.__mutex = Lock()

    def get_entailments_with_levels(self, sentence, sentences):
        """Analyzes relation between a sentence and all in a collection

        Args:
            sentence: a sentence
            sentences: a non-empty list of sentences

        Returns:
            entailment:
                for each element of sentences: 0 if entailed, 1 if neutral, 2 if contradicting
            level:
                a non-negative score of how strongly the sentence is entailed by each element of sentences
        """
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation([sentence] + sentences, pool='last', return_numpy=True, tokenize=True)
        # Pair features [u, v, u * v] for the query sentence against each candidate.
        input = np.concatenate((
            np.repeat([encoded[0]], len(sentences), axis=0),
            encoded[1:],
            (np.repeat([encoded[0]], len(sentences), axis=0)) * encoded[1:]), axis=1)
        output = self.__model_predict(input)
        self.__mutex.release()

        entailment = np.argmax(output, axis=1)

        level = np.max(output, axis=1) - np.transpose(output)[1]

        for i, sent in enumerate(sentences):
            if sentence == sent:
                entailment[i] = 0
                level[i] = 1e10

        return entailment, level

    def get_entailment(self, sentence1, sentence2):
        """Analyzes relation between two sentences

        Args:
            sentence1: first sentence as a string
            sentence2: second sentence as a string

        Returns:
            0 if entailed, 1 if neutral, 2 if contradicting
        """

        if sentence1 == sentence2:
            return 0

        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation([sentence1, sentence2], pool='last', return_numpy=True, tokenize=True)
        input = np.concatenate((encoded[0], encoded[1], encoded[0] * encoded[1]))
        output = self.__model_predict(np.array([input]))
        self.__mutex.release()

        return np.argmax(output)

    def __model_predict(self, input):
        sentence_size = input.shape[1] // 3
        batch_size = input.shape[0]
        # Score each pair a second time with the two sentence vectors swapped
        # (the product term is symmetric); the two predictions are averaged below.
        switched_input = np.hstack((input[:, sentence_size:2*sentence_size], input[:, 0:sentence_size], input[:, 2*sentence_size:3*sentence_size]))

        input = np.vstack((input, switched_input))
        
        self.__evaluator.model.eval()
        input = torch.FloatTensor(input).cuda()
        yhat = []
        with torch.no_grad():
            for i in range(0, len(input), self.__evaluator.batch_size):
                x = input[i:i + self.__evaluator.batch_size]
                output = self.__evaluator.model(x)
                yhat.append(output.data.cpu().numpy())
        yhat = np.vstack(yhat)
        yhat = (yhat[0:batch_size, :] + yhat[batch_size:2*batch_size, :]) / 2
        return yhat
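A hypothetical usage sketch for the class above; it assumes the GenSen checkpoints and senteval.pickle exist at the paths hard-coded in __init__, and that a CUDA device is available, since __model_predict moves its inputs to the GPU.

analyser = SemanticAnalyser()
print(analyser.get_entailment('A man is playing a guitar.',
                              'Someone is making music.'))  # 0, 1 or 2
entailment, level = analyser.get_entailments_with_levels(
    'A man is playing a guitar.',
    ['Someone is making music.', 'The room is empty.'])
print(entailment, level)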
Example #5
                                max_length=max_length)
    model = GenSenSingle(model_folder=args.folder_path,
                         filename_prefix=args.prefix,
                         pretrained_emb=args.pretrain,
                         cuda=True)
    iterator.word2id = model.word2id
    iterator.id2word = model.id2word
    model.vocab_expansion(model.id2word.values())
    sentences = iterator.lines if batch_size == 'all' else iterator.lines[
        0:batch_size]
    sentences = [' '.join(s[:max_length]) for s in sentences]
    repr_last_h = np.empty((0, hidden_size))
    for mbatch_idx, mbatch in enumerate(range(0, len(sentences), 200)):
        less_sentences = sentences[mbatch:mbatch + 200]
        _, last_h = model.get_representation(less_sentences,
                                             pool='last',
                                             return_numpy=True,
                                             tokenize=False)
        repr_last_h = np.append(repr_last_h, last_h, axis=0)
    print(repr_last_h.shape)
    iterator.build_kde(repr_last_h=repr_last_h,
                       num_dim_pca=40,
                       grid_search_num=7)
    data_gen = iterator.sample_kde(batch_size=10, cuda=True)
    print(data_gen['input'].shape)
    iterator.save_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
    iterator.load_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
    data_gen = iterator.sample_kde(batch_size=10, cuda=False)
    print(data_gen['input'].shape)
    total_time = time.time() - start
    print(total_time)
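One design note on the encoding loop above: growing repr_last_h with np.append copies the whole array on every minibatch. A sketch of the usual alternative (same model and sentences assumed) collects the chunks in a list and stacks them once:

chunks = []
for start in range(0, len(sentences), 200):
    _, last_h = model.get_representation(sentences[start:start + 200],
                                         pool='last',
                                         return_numpy=True,
                                         tokenize=False)
    chunks.append(last_h)
repr_last_h = np.vstack(chunks)  # (len(sentences), hidden_size)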