import json

import h5py
import numpy as np


def get_gensen_synset_definitions(entity_file, vocab_file, gensen_file):
    from gensen import GenSen, GenSenSingle
    gensen_1 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5')
    gensen_1.eval()

    # entity_file is JSON-lines; keep only synset nodes and their glosses
    definitions = {}
    with open(entity_file, 'r') as fin:
        for line in fin:
            node = json.loads(line)
            if node['type'] == 'synset':
                definitions[node['id']] = node['definition']

    with open(vocab_file, 'r') as fin:
        vocab_list = fin.read().strip().split('\n')

    # get the descriptions; NUM_EMBEDDINGS is a module-level constant
    # (at least len(vocab_list) + 1, since index 0 is reserved)
    sentences = [''] * NUM_EMBEDDINGS
    for k, entity in enumerate(vocab_list):
        definition = definitions.get(entity)
        if definition is None:
            # only the special tokens are allowed to lack a definition
            assert entity in ('@@UNKNOWN@@', '@@MASK@@', '@@NULL@@')
        else:
            sentences[k + 1] = definition

    # encode definitions in batches of 32; the last-pooled hidden state
    # gives one 2048-d vector per sentence
    embeddings = np.zeros((NUM_EMBEDDINGS, 2048), dtype=np.float32)
    for k in range(0, NUM_EMBEDDINGS, 32):
        sents = sentences[k:(k + 32)]
        reps_h, reps_h_t = gensen_1.get_representation(
            sents, pool='last', return_numpy=True, tokenize=True)
        embeddings[k:(k + 32), :] = reps_h_t
        print(k)

    with h5py.File(gensen_file, 'w') as fout:
        fout.create_dataset('gensen', data=embeddings)
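# A minimal read-back sketch, not part of the original script: the dataset
# written above holds one 2048-d GenSen vector per vocab entry (rows are
# offset by one relative to vocab_file, per the `sentences[k + 1]`
# assignment). The file name here is a hypothetical example.
import h5py

with h5py.File('gensen_synsets.h5', 'r') as fin:
    embeddings = fin['gensen'][:]
    print(embeddings.shape)  # (NUM_EMBEDDINGS, 2048)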
feat_h5 = h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'w')
with open(f'{DATA_PATH}/questions/CLEVR_{split}_questions.json') as fin:
    ques = json.load(fin)['questions']
questions = [q['question'] for q in ques]
qids = np.int64([q['question_index'] for q in ques])

# variable-length string dtype for storing the raw question text
dt = h5py.special_dtype(vlen=str)
feat_h5.create_dataset('feats', (len(qids), 2048), dtype=np.float32)
feat_h5.create_dataset('qids', (len(qids), ), dtype=np.int64)
feat_h5.create_dataset('questions', (len(qids), ), dtype=dt)
feat_h5['qids'][:] = qids
feat_h5['questions'][:] = questions

# encode the questions in chunks to bound memory use
chunksize = 5000
question_chunks = [
    questions[x:x + chunksize] for x in range(0, len(questions), chunksize)
]
done = 0
for qchunk in question_chunks:
    print(done)
    _, reps_h_t = gensen_1.get_representation(
        qchunk, pool='last', return_numpy=True, tokenize=True)
    feat_h5['feats'][done:done + len(qchunk)] = reps_h_t
    done += len(qchunk)
feat_h5.close()
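# A hedged usage sketch for the file written above: look up the GenSen
# feature vector for a single CLEVR question id. The qid 42 is an arbitrary
# example; DATA_PATH and split are the same variables as in the script.
import h5py
import numpy as np

with h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'r') as fin:
    qids = fin['qids'][:]
    row = int(np.where(qids == 42)[0][0])
    print(fin['questions'][row], fin['feats'][row].shape)  # text, (2048,)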
# `trump`, `shakespeare`, `wutang` and `lukecombs` are speaker quote lists
# defined earlier in the file, analogous to `obama` below.
obama = [
    'Condolences to the family of John Singleton. His seminal work, Boyz n the Hood, remains one of the most searing, loving portrayals of the challenges facing inner-city youth. He opened doors for filmmakers of color to tell powerful stories that have been too often ignored.',
    "This generation of climate activists is tired of inaction, and they've caught the attention of leaders all over the world. So while this challenge is only getting more urgent, they show us the kind of action it'll take to meet this moment.",
    'That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet.'
]

# one label per sentence, in the same order as `sentences` below
idx2speaker = [
    'trump1', 'trump2', 'trumpinaguration', 'obama1', 'obama2',
    'obamainaguration', 'shakespeare1', 'shakespeare2', 'wutang1', 'wutang2',
    'lukecombs', 'lukecombs'
]
sentences = trump + obama + shakespeare + wutang + lukecombs

gensen_1 = GenSenSingle(
    model_folder='./data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='./data/embedding/glove.840B.300d.h5')
reps_h, reps_h_t = gensen_1.get_representation(
    sentences, pool='last', return_numpy=True, tokenize=True)

# mean-pool the per-token hidden states into one vector per sentence
x = []
for i in range(len(reps_h)):
    x.append(reps_h[i].mean(axis=0))

model = TSNE(n_components=2, perplexity=20, init='pca', method='exact',
             n_iter=5000)
x = model.fit_transform(np.array(x))

pyplot.figure(figsize=(20, 20))
pyplot.xlim((np.min(x[:, 0]) - 10, np.max(x[:, 0]) + 10))
pyplot.ylim((np.min(x[:, 1]) - 10, np.max(x[:, 1]) + 10))
for i in range(len(x)):
    # assumed continuation: annotate each 2-D point with its speaker label
    pyplot.annotate(idx2speaker[i], xy=(x[i, 0], x[i, 1]))
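# A small companion sketch, assuming `pooled` holds the mean-pooled GenSen
# vectors collected above (i.e. the list `x` before t-SNE is applied):
# pairwise cosine similarity gives a quantitative counterpart to the plot.
import numpy as np

def cosine_matrix(vectors):
    """Pairwise cosine similarities between row vectors."""
    v = np.asarray(vectors)
    v = v / np.linalg.norm(v, axis=1, keepdims=True)
    return v @ v.T

# sim = cosine_matrix(pooled)
# print(idx2speaker[np.argsort(-sim[0])[1]])  # nearest speaker to 'trump1'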
import os
import pickle
from threading import Lock

import numpy as np
import torch

from gensen import GenSenSingle  # assumes the GenSen package is importable


class SemanticAnalyser(object):
    """Class for comparing sentences for entailment"""

    def __init__(self):
        """Initializes object"""
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__), 'GenSen',
                                      'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen',
                                        'data', 'embedding',
                                        'glove.840B.300d.h5'))
        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data',
                               'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)
        self.__mutex = Lock()

    def get_entailments_with_levels(self, sentence, sentences):
        """Analyzes the relation between a sentence and each sentence in a
        collection.

        Args:
            sentence: a sentence
            sentences: a non-empty list of sentences

        Returns:
            entailment: 0 if entailed, 1 if neutral, 2 if contradicting,
                for each element in sentences
            level: a non-negative value of how strongly this sentence is
                entailed by each element in sentences
        """
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence] + sentences, pool='last', return_numpy=True,
            tokenize=True)
        # SNLI-style features [u, v, u * v] for every premise/hypothesis pair
        input = np.concatenate(
            (np.repeat([encoded[0]], len(sentences), axis=0),
             encoded[1:],
             np.repeat([encoded[0]], len(sentences), axis=0) * encoded[1:]),
            axis=1)
        output = self.__model_predict(input)
        self.__mutex.release()
        entailment = np.argmax(output, axis=1)
        # confidence margin of the predicted class over the neutral class
        level = np.max(output, axis=1) - np.transpose(output)[1]
        # identical sentences trivially entail each other
        for i, sent in enumerate(sentences):
            if sentence == sent:
                entailment[i] = 0
                level[i] = 1e10
        return entailment, level

    def get_entailment(self, sentence1, sentence2):
        """Analyzes the relation between two sentences.

        Args:
            sentence1: first sentence as a string
            sentence2: second sentence as a string

        Returns:
            0 if entailed, 1 if neutral, 2 if contradicting
        """
        if sentence1 == sentence2:
            return 0
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence1, sentence2], pool='last', return_numpy=True,
            tokenize=True)
        input = np.concatenate((encoded[0], encoded[1],
                                encoded[0] * encoded[1]))
        output = self.__model_predict(np.array([input]))
        self.__mutex.release()
        return np.argmax(output)

    def __model_predict(self, input):
        # evaluate every pair in both argument orders and average the two
        # predictions, making the classifier order-insensitive
        sentence_size = input.shape[1] // 3
        batch_size = input.shape[0]
        switched_input = np.hstack(
            (input[:, sentence_size:2 * sentence_size],
             input[:, 0:sentence_size],
             input[:, 2 * sentence_size:3 * sentence_size]))
        input = np.vstack((input, switched_input))
        self.__evaluator.model.eval()
        input = torch.FloatTensor(input).cuda()
        yhat = []
        with torch.no_grad():
            for i in range(0, len(input), self.__evaluator.batch_size):
                x = input[i:i + self.__evaluator.batch_size]
                output = self.__evaluator.model(x)
                yhat.append(output.data.cpu().numpy())
        yhat = np.vstack(yhat)
        yhat = (yhat[0:batch_size, :] + yhat[batch_size:2 * batch_size, :]) / 2
        return yhat
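# A hedged usage sketch for SemanticAnalyser (assumes the GenSen model files
# and senteval.pickle are in place and a CUDA device is available; label
# semantics follow the docstrings: 0 entailed, 1 neutral, 2 contradicting).
if __name__ == '__main__':
    analyser = SemanticAnalyser()
    print(analyser.get_entailment('A man is playing a guitar.',
                                  'A person is playing an instrument.'))
    entailments, levels = analyser.get_entailments_with_levels(
        'A man is playing a guitar.',
        ['A person is playing an instrument.', 'Nobody is making music.'])
    print(entailments, levels)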
    max_length=max_length)
model = GenSenSingle(model_folder=args.folder_path,
                     filename_prefix=args.prefix,
                     pretrained_emb=args.pretrain,
                     cuda=True)
iterator.word2id = model.word2id
iterator.id2word = model.id2word
model.vocab_expansion(model.id2word.values())

sentences = (iterator.lines if batch_size == 'all'
             else iterator.lines[0:batch_size])
sentences = [' '.join(s[:max_length]) for s in sentences]

# encode in mini-batches of 200 and stack the last-pooled representations
repr_last_h = np.empty((0, hidden_size))
for mbatch_idx, mbatch in enumerate(range(0, len(sentences), 200)):
    less_sentences = sentences[mbatch:mbatch + 200]
    _, last_h = model.get_representation(less_sentences,
                                         pool='last',
                                         return_numpy=True,
                                         tokenize=False)
    repr_last_h = np.append(repr_last_h, last_h, axis=0)
print(repr_last_h.shape)

# fit a PCA-reduced kernel density estimate over the representations,
# sample from it, and round-trip the fitted estimator through disk
iterator.build_kde(repr_last_h=repr_last_h, num_dim_pca=40, grid_search_num=7)
data_gen = iterator.sample_kde(batch_size=10, cuda=True)
print(data_gen['input'].shape)
iterator.save_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
iterator.load_kde(file_name_kde="kde.sav", file_name_pca="pca.sav")
data_gen = iterator.sample_kde(batch_size=10, cuda=False)
print(data_gen['input'].shape)

total_time = time.time() - start
print(total_time)
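# The iterator's build_kde / sample_kde / save_kde are project-specific; a
# minimal standalone equivalent, sketched here with scikit-learn as an
# assumption rather than the project's actual implementation, reduces the
# representations with PCA and fits a Gaussian KDE in the reduced space.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

def fit_and_sample(reprs, num_dim_pca=40, bandwidth=0.5, n_samples=10):
    """Fit PCA + KDE on sentence representations and draw new samples."""
    pca = PCA(n_components=num_dim_pca).fit(reprs)
    kde = KernelDensity(bandwidth=bandwidth).fit(pca.transform(reprs))
    # KDE samples live in PCA space; project them back to hidden_size dims
    return pca.inverse_transform(kde.sample(n_samples))

# samples = fit_and_sample(repr_last_h)  # shape: (10, hidden_size)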