示例#1
0
class UniversalSentenceEncoder:
    """Score sentence-pair similarity with a pretrained InferSent encoder.

    Despite its name, the class wraps an ``InferSent`` model (version 1 with
    GloVe word vectors), not Google's Universal Sentence Encoder.
    """

    def __init__(self):
        """Load the InferSent weights and the 100k most frequent word vectors.

        The model is moved to the GPU when one is available and otherwise
        stays on the CPU.
        """
        super().__init__()
        model_version = 1
        MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
        W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'

        params_model = {
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version
        }
        self.model = InferSent(params_model)
        # Deserialize onto the CPU first so the checkpoint also loads on
        # machines without CUDA; the model is moved to the GPU afterwards.
        self.model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
        self.model.eval()
        # Detect the GPU instead of assuming one: a hard-coded True makes
        # construction fail on CPU-only hosts.
        use_cuda = torch.cuda.is_available()
        self.model = self.model.cuda() if use_cuda else self.model
        self.model.set_w2v_path(W2V_PATH)
        self.model.build_vocab_k_words(K=100000)

    def semantic_sim(self, sents1, sents2):
        """Return a row-wise similarity score for two aligned sentence lists.

        Both lists are encoded with ``tokenize=False``; row ``i`` of the
        result compares ``sents1[i]`` with ``sents2[i]``.

        Args:
            sents1: first batch of sentences.
            sents2: second batch of sentences, same length as ``sents1``.

        Returns:
            1-D numpy array holding ``1 - arccos(cosine_similarity)`` per
            pair: ``1.0`` for identical directions, ``1 - pi/2`` for
            orthogonal ones, ``1 - pi`` for opposite ones.
        """
        embed1 = torch.tensor(self.model.encode(sents1, tokenize=False))
        embed2 = torch.tensor(self.model.encode(sents2, tokenize=False))
        # Clamp the norms away from zero so an all-zero embedding is treated
        # as orthogonal (cosine 0) rather than producing NaN scores.
        eps = 1e-8
        norm1 = torch.clamp(torch.norm(embed1, p=2, dim=1, keepdim=True), min=eps)
        norm2 = torch.clamp(torch.norm(embed2, p=2, dim=1, keepdim=True), min=eps)
        sts_encode1 = embed1 / norm1
        sts_encode2 = embed2 / norm2
        cosine_similarities = torch.sum(sts_encode1 * sts_encode2, dim=1)
        # Rounding can push the cosine marginally outside [-1, 1], where
        # acos is undefined.
        clip_cosine_similarities = torch.clamp(cosine_similarities, -1.0, 1.0)
        scores = 1.0 - torch.acos(clip_cosine_similarities)
        return scores.cpu().numpy()
示例#2
0
    def compute_intent_vectors(self, sentences):
        """Build one averaged InferSent embedding per intent.

        The sentences are grouped by intent through
        ``self.get_utterances_dict``; every group is encoded with a freshly
        loaded InferSent (version 1, GloVe) model and the group's embeddings
        are averaged along axis 0.

        Returns:
            dict mapping each intent to its mean embedding.
        """
        # TODO: implement caching (the model is reloaded on every call).
        from InferSent.models import InferSent

        base_dir = Path('./InferSent')
        weights_file = Path(base_dir / 'encoder' / 'infersent1.pkl')
        encoder_config = {
            'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
            'pool_type': 'max', 'dpout_model': 0.0, 'version': 1,
        }
        glove_file = base_dir / 'GloVe' / 'glove.840B.300d.txt'

        encoder = InferSent(encoder_config)
        encoder.load_state_dict(torch.load(weights_file))
        if torch.cuda.is_available():
            encoder.cuda()
        encoder.set_w2v_path(glove_file)
        encoder.build_vocab_k_words(K=100000)

        utterances_by_intent = self.get_utterances_dict(sentences)
        intent_count = len(utterances_by_intent.items())

        vectors = {}
        for position, (intent, utterances) in enumerate(
                utterances_by_intent.items(), start=1):
            LOGGER.info(f'{position}/{intent_count} done')
            # Collapse the per-utterance embeddings into a single vector.
            vectors[intent] = np.mean(encoder.encode(utterances), axis=0)

        return vectors
示例#3
0
def _load_pretrained_model(verbose=True):
    """Create an InferSent encoder, load its weights and build its vocabulary.

    Args:
        verbose: when true, print the path the weights are loaded from.

    Returns:
        The InferSent instance configured from the module-level
        ``_PARAMS_MODEL``, ``_MODEL_PATH``, ``_W2V_PATH`` and
        ``_K_WORDS_VOCAB`` settings.
    """
    if verbose:
        print(f">>> Loading pretrained model from {_MODEL_PATH}")

    encoder = InferSent(_PARAMS_MODEL)
    state = torch.load(_MODEL_PATH)
    encoder.load_state_dict(state)

    # The word vectors must be registered before the vocabulary is built.
    encoder.set_w2v_path(_W2V_PATH)
    encoder.build_vocab_k_words(K=_K_WORDS_VOCAB)
    return encoder
    def __init__(self):
        """Load InferSent (version 1) from the project root into ``self.model``.

        The encoder weights and the word-vector file are both resolved
        relative to ``get_project_root()``; the vocabulary is limited to the
        100000 most frequent words.
        """
        model_version = 1
        weights_path = get_project_root() / Path("encoder/infersent%s.pkl" % model_version)

        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': model_version,
        })
        encoder.load_state_dict(torch.load(weights_path))

        # Version 1 is paired with GloVe vectors, any other version with fastText.
        if model_version == 1:
            vectors_file = 'GloVe/glove.840B.300d.txt'
        else:
            vectors_file = '../fastText/crawl-300d-2M.vec'
        encoder.set_w2v_path(get_project_root() / Path(vectors_file))

        # Keep only the most frequent word vectors in memory.
        encoder.build_vocab_k_words(K=100000)

        self.model = encoder
示例#5
0
def Start_chatbot():
    """Load InferSent and index the question/answer files for the chatbot.

    Line ``i`` of ``../data/questions.txt`` is paired with line ``i`` of
    ``../data/answers.txt``; every question is additionally encoded into an
    InferSent embedding.

    Returns:
        Tuple ``(model, answers_by_question, embeddings)`` where both
        dictionaries are keyed by the stripped question text.
    """
    model_version = 1
    weights_path = "../InferSent/encoder/infersent%s.pkl" % model_version
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(torch.load(weights_path))

    # Switch to True to run the encoder on the GPU.
    use_cuda = False
    if use_cuda:
        model = model.cuda()

    if model_version == 1:
        vectors_path = '../data/glove.6B.300d.txt'
    else:
        vectors_path = '../dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(vectors_path)

    model.build_vocab_k_words(K=570000)

    with open('../data/questions.txt') as handle:
        questions = [line.strip() for line in handle.readlines()]

    with open('../data/answers.txt') as handle:
        answers = [line.strip() for line in handle.readlines()]

    answers_by_question = {}
    embeddings = {}
    for index, question in enumerate(questions):
        answers_by_question[question] = answers[index]
        # encode() takes a batch; take the single resulting row.
        embeddings[question] = model.encode([question])[0]

    return model, answers_by_question, embeddings
示例#6
0
def process(channel):
    """Classify the topics of every not-yet-visited transcript file of a channel.

    For each pickle under ``../files/CableNews/<channel>/`` that is not listed
    in ``<channel>_visit.p``, the function:

    1. keeps the sentences of each segment that have between 31 and 49
       whitespace-separated words (the first sentence is always dropped);
    2. embeds them with InferSent and classifies them with the
       ``nn-classifier-v2`` network;
    3. sums the per-sentence class probabilities of each segment and stores
       the two highest-scoring topics (with ``gender``, ``persons`` and
       ``locations`` copied from the segment metadata);
    4. writes the result to ``processed_data/<channel>/<counter>.p``.

    Args:
        channel: channel name used to build the input, visit-list and output
            paths.

    Note:
        All inputs are read with ``pickle.load``; only run this on files
        produced by a trusted pipeline.
    """
    # Load the classifier
    tf.reset_default_graph()
    NN = classifer()
    NN.load('nn-classifier-v2')

    # Load the sentence embedder
    model_version = 1
    MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    all_files = glob.glob('../files/CableNews/%s/*.p' % channel)
    visit_path = '%s_visit.p' % (channel)
    with open(visit_path, 'rb') as handle:
        read_files = pickle.load(handle)
    # Output files are numbered after the count of already-visited inputs.
    counter = len(read_files)

    for file in tqdm(all_files):
        if file in read_files:
            continue
        read_files.append(file)
        # Persist the visit list only on ~30% of the files to limit disk
        # writes; the list on disk may therefore lag behind.
        if np.random.rand() < 0.3:
            with open(visit_path, 'wb') as handle:
                pickle.dump(read_files, handle)

        with open(file, 'rb') as handle:
            res = pickle.load(handle)
        results = {}
        prev_text = ""
        all_text = []
        all_keys = []
        for key in res.keys():
            meta_data = res[key][0]  # First in the list
            if len(meta_data['text']) < 10:
                continue

            # Drop duplicates: consecutive segments sharing the same first
            # 10 characters are considered identical.
            current_text = meta_data['text'][:10]
            if current_text == prev_text:
                continue
            prev_text = current_text

            text = tokenizer.tokenize(meta_data['text'])
            if len(text) <= 2:
                continue
            # Drop the first sentence, then the very short and very long ones
            sentences = [s for s in text[1:] if 30 < len(s.split()) < 50]
            if not sentences:
                continue
            all_text.extend(sentences)
            all_keys.extend([key] * len(sentences))
        if not all_text:
            continue

        # Calculate the embeddings and classify every sentence
        all_embed = model.encode(all_text,
                                 bsize=128,
                                 tokenize=True,
                                 verbose=False)
        all_predictions = NN.predict(all_embed)[0]

        # Sum the sentence probabilities per segment. Accumulating first and
        # ranking afterwards guarantees that every segment is ranked on its
        # OWN totals (ranking at the moment the key changes would attribute
        # the previous segment's totals to the new key and never emit the
        # totals of the last segment).
        total_probs = {}
        for key, prediction in zip(all_keys, all_predictions):
            if key in total_probs:
                total_probs[key] += prediction
            else:
                total_probs[key] = np.array(prediction, dtype=float)

        # Take the top 2 topics of every segment.
        # NOTE(review): Ticks is defined outside this function; it is assumed
        # to be an array of topic labels indexable by an integer array.
        for key, total in total_probs.items():
            top_two = np.flip(np.argsort(total)[-2:])
            results[key] = {
                'Topics': list(Ticks[top_two]),
                'Probs': list(total[top_two] * 100),
                'gender': res[key][0]['gender'],
                'persons': res[key][0]['persons'],
                'locations': res[key][0]['locations']
            }

        with open('processed_data/%s/%d.p' % (channel, counter), 'wb') as handle:
            pickle.dump(results, handle)
        counter += 1
# Location of the pretrained InferSent (version 1) weights, relative to the
# working directory.
model_pkl = '../InferSent/encoder/infersent1.pkl'
# Hyper-parameters handed to the InferSent constructor.
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1
}
# Module-level encoder shared by the helpers below.
infer_sent_model = InferSent(params_model)
infer_sent_model.load_state_dict(torch.load(model_pkl))

# In[111]:

# Register the word-vector file (glove_w2v_loc is defined outside this
# section) and load the vectors of the 100000 most frequent words.
infer_sent_model.set_w2v_path(glove_w2v_loc)
infer_sent_model.build_vocab_k_words(K=100000)

# infer_sent_model.to(torch.device("cuda:0"))

# In[112]:

# Smoke test of the encoder; the returned embedding is not stored, so it is
# only visible when this runs as a notebook cell.
infer_sent_model.encode(["This man is playing computer games"], tokenize=True)

# In[113]:


def get_embedding_for_context(ctx):
    """Encode one context, or a list of contexts, with the shared InferSent model.

    A value that is not a list is wrapped into a one-element list first, so
    the encoder always receives a batch.

    Returns:
        Whatever ``infer_sent_model.encode`` returns for the batch
        (tokenization enabled).
    """
    batch = ctx if isinstance(ctx, list) else [ctx]
    return infer_sent_model.encode(batch, tokenize=True)
示例#8
0
    'version': 1
}
# Build the encoder from params_model and load the pretrained weights
# (params_model and MODEL_PATH are defined above this section).
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# CUDA
# NOTE(review): use_cuda is hard-coded to True, so this script requires a
# CUDA-capable machine.
use_cuda = True
model = model.cuda() if use_cuda else model

# Word vectors used by the encoder: the GloVe 840B/300d file, resolved
# relative to the working directory.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
print('Load glove')
model.build_vocab_k_words(K=2000000)

# load sentence
# dis: input directory, listed below to obtain one entry per patient.
# res: output directory for the InferSent results.
dis = '/home/shl183/nlp4note/classified_txt/discharge-sep'
res = '/home/shl183/nlp4note/infersent'
patients = os.listdir(dis)
# exist = os.listdir(res)
# with open('./tmp.pkl','wb') as f:
#     pickle.dump(exist,f)
for patient in patients:
    # if patient in exist:
    #     continue
    notes = os.listdir('{}/{}'.format(dis, patient))
    for note in notes:
        tps = os.listdir('{}/{}/{}'.format(dis, patient, note))
        if os.path.exists('{}/{}/{}'.format(res, patient, note)):
示例#9
0
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': VERSION
}
# Pick the word-vector file that matches the encoder version:
# GloVe for version 1, fastText otherwise.
if VERSION == 1:
    W2V = 'C:/users/georg/Desktop/GloVe/glove.840B.300d.txt'
else:
    W2V = 'C:/Users/georg/Desktop/fastText/crawl-300d-2M.vec'
VOCAB_SIZE = 100000
NUM_STEPS = 300

# set up model
# (PARAMS, DEVICE and WEIGHTS are defined above this section)
model = InferSent(PARAMS).to(DEVICE)
model.load_state_dict(torch.load(WEIGHTS))
model.set_w2v_path(W2V)
# NOTE(review): the loop below requires word2vec to be a word -> vector
# mapping. Upstream InferSent's build_vocab_k_words appears to store the
# vocabulary on the model and return None -- confirm that the InferSent
# version used here returns the mapping.
word2vec = model.build_vocab_k_words(K=VOCAB_SIZE)

# setup the NN-classifer
# A 1-nearest-neighbour classifier fitted on (vector, word) pairs serves as
# the inverse lookup: given a vector it predicts the closest word.
vec2word = KNeighborsClassifier(n_neighbors=1)
vecs = []
words = []
for key, val in word2vec.items():
    # Keep only well-formed 300-dimensional vectors.
    if val.shape == (300, ):
        vecs.append(val)
        words.append(key)
X = np.vstack(vecs)
y = np.array(words)
vec2word.fit(X, y)


# NOTE: stacked word vectors are len x 1 x vec dim
示例#10
0
文件: draft.py 项目: liangeric/nlpQ-A
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)

    # infersent.build_vocab(sentences, tokenize=True)
    infersent.build_vocab_k_words(K=100000)

    embeddings = infersent.encode(sentences,
                                  bsize=128,
                                  tokenize=False,
                                  verbose=True)
    print('nb sentences encoded : {0}'.format(len(embeddings)))
    #### End Paste

    parsedQs = []
    with open(questions, "r+") as f:
        for q in f.readlines():
            parsedQs.append(preprocessQs(q))
    # print(parsedQs)s

    qEmbeddings = infersent.encode(parsedQs,