Example #1
    def encode_article(self):
        """
        Method to embed sentences into sentence vectors.
        :return: List of encoded sentences
        """
        sentences_to_encode = []
        encoded_article = [None] * len(self.article)
        sentence_list = [0]
        sent_count = 0

        for sentence in self.article:
            sent_count += len(sentence)
            sentence_list.append(sent_count)
            for sent in sentence:
                sentences_to_encode.append(sent)  # flatten into a single list of sentences

        print('Loading models...')
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        print('Encoding sentences...')
        encoded_sentences = encoder.encode(sentences_to_encode, verbose=False)
        print('Done')

        for i in range(len(self.article)):
            begin = sentence_list[i]
            end = sentence_list[i + 1]
            encoded_article[i] = encoded_sentences[begin:end]
        return encoded_article
Example #2
def load_inputs(path, limit=-1):
    '''
    Returns captions, resnet_embeddings, vectors
    '''

    # load captions and resnet embeddings
    split = pickle.load(open(path, "rb"))
    ret = None
    if limit != -1:
        split = split[:limit]

    if len(split[0]) == 2:
        # Encode captions via skip-thoughts if embeddings are not already included
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        captions = [s[0] for s in split]
        resnet_embeddings = [s[1].flatten() for s in split]
        vectors = encoder.encode(captions)
        ret = captions, resnet_embeddings, vectors
    elif len(split[0]) == 3:
        captions = [s[0] for s in split]
        resnet_embeddings = [s[1].flatten() for s in split]
        vectors = [s[2].flatten() for s in split]
        vectors = np.array(vectors)
        ret = captions, resnet_embeddings, vectors
    else:
        raise "Input pickled vectors should be a list of two tuples (caption,resnet) or\
		three-tuples (caption,resnet,embedding)"

    return ret
Example #3
def skipthought_encode(sentences):
    """
    Obtains sentence embeddings for each sentence in the text
    """
    # print('Loading pre-trained models...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    # print('Encoding sentences...')
    encoded = encoder.encode(sentences)
    return encoded
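
A minimal usage sketch for the helper above; the sample text and the NLTK sentence splitter are assumptions rather than part of the original example:

# Hypothetical usage of skipthought_encode(); nltk.sent_tokenize is an assumed splitter.
import nltk

text = ("Skip-thought vectors encode whole sentences. "
        "Each sentence maps to a single fixed-size vector.")
sentences = nltk.sent_tokenize(text)
embeddings = skipthought_encode(sentences)
# The combined uni-skip/bi-skip model yields 4800-dimensional vectors (2400 + 2400).
print(embeddings.shape)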
Example #4
def load_input(path):
    input_images = load_input_images(path)
    input_sentences, labels = load_input_sentences()

    inputs = list(zip(input_images, input_sentences, labels))
    random.shuffle(inputs)
    input_images, input_sentences, labels = zip(*inputs)

    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    input_sentences = encoder.encode(input_sentences)

    return input_images, input_sentences, labels
Example #5
def transform_sentences(sentences):
    """
    Builds sentence embeddings using the Skip-thoughts model.

    :param sentences: Input list of sentences to encode.
    :return: Torch matrix of embeddings, size 1024.
    """
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    #sentences = _df['sentText'].tolist()
    _sent_embs = encoder.encode(sentences, verbose=True)
    _sent_tensors = [torch.from_numpy(j) for j in _sent_embs]
    return torch.stack(_sent_tensors)
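
A short, hypothetical call to transform_sentences above (the example sentences are made up); the row dimensionality depends on which skip-thoughts variant is loaded, so the shape noted below is an assumption about the combined uni/bi-skip model:

# Hypothetical usage; the example sentences are assumptions.
sents = ["The service was excellent.", "The food arrived cold."]
emb_matrix = transform_sentences(sents)
print(emb_matrix.shape)  # e.g. torch.Size([2, 4800]) with the combined uni/bi-skip model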
Example #6
def encodeSentences(reviews):
    enc_reviews = [None] * len(reviews)
    cum_sum_sentences = [0]
    sent_count = 0
    for review in reviews:
        sent_count += len(review)
        cum_sum_sentences.append(sent_count)

    # Flatten the reviews into one list of sentences for a single encode() call
    all_sentences = [sent for review in reviews for sent in review]

    print('Loading pretrained model...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)

    # Slice the flat embedding matrix back into one block per review
    for i in range(len(reviews)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i + 1]
        enc_reviews[i] = enc_sentences[begin:end]
    return enc_reviews
Example #7
def get_pretrained_encodings(pretrained=False):
    '''
    Get encodings using the pre-trained models.
    '''

    word_set = set()
    dict_f = open(os.path.join(DATA_PATH, 'word2vec/dict.txt'), 'r')
    for line in dict_f:
        word_set.add(line.strip())
    dict_f.close()

    # Getting the data.
    with open(os.path.join(DATA_PATH, 'amazon_food/train_data.pkl'), 'r') as f:
        train_data = cPickle.load(f)
        train_preprocessed = preprocess(train_data[0], word_set)
    with open(os.path.join(DATA_PATH, 'amazon_food/test_data.pkl'), 'r') as f:
        test_data = cPickle.load(f)
        test_preprocessed = preprocess(test_data[0], word_set)

    if pretrained:
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        test_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_test_pretrained.npy'
        )
        train_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_train_pretrained.npy'
        )
        print('Encoding training vectors')
        train_vectors = encoder.encode(train_preprocessed)
        print('Encoding test vectors')
        test_vectors = encoder.encode(test_preprocessed)
    else:
        model = tools.load_model(None)
        test_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_test_bi.npy')
        train_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_train_bi.npy')
        print('Encoding training vectors')
        train_vectors = tools.encode(model, train_preprocessed)
        print('Encoding test vectors')
        test_vectors = tools.encode(model, test_preprocessed)

    np.save(train_save_path, train_vectors)
    np.save(test_save_path, test_vectors)
Example #8
def test_data(text_path):
    encoder = skipthoughts.Encoder(skipthoughts.load_model())

    testing_text = open(text_path).read().strip().split('\n')

    ids = []
    texts = []
    for line in testing_text:
        line = line.split(',')
        id = line[0]
        text = line[1]

        for i in range(5):
            ids.append((id, i))
            texts.append(text)

    captions = encoder.encode(texts)

    return ids, captions
Example #9
def generate_and_save_word_embeddings_for_sentences_text(
        input_file,
        embeddings_output_path,
        embeddings_id_output_file,
        for_testing=False,
        is_labeled=True):
    """ Generate the embeddings and save them in a different file
       Parameters:
       -----------
       input_file: string
       path to the file to load the data from

       embeddings_output_path: string
       path to the directory to save the embeddings in

       embeddings_id_output_file: string
       path to the file to save the embeddings id in
    """
    list_of_stories = load_and_process_text_data(input_file, for_testing,
                                                 is_labeled)
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)

    fout_id = open(embeddings_id_output_file, "w")
    for story in list_of_stories:
        print(story)
        if not for_testing:
            embeddings = encoder.encode(
                story.get_story_with_right_ending_as_list())
        else:
            embeddings = encoder.encode(
                story.get_story_with_both_endings_as_list())
        output_file = open(embeddings_output_path + story.id, "wb")
        for embed in embeddings:
            b = bytes()
            b = b.join((struct.pack('f', e) for e in embed))
            output_file.write(b)
        print(story.id)
        output_file.close()
        fout_id.write(story.id)
        fout_id.write("\n")
    fout_id.close()
Example #10
def main():
    """
    Executes the entire pipeline of the code
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []
    #print("Fetching encoder model...", end=" ")
    #enc_model = SentenceTransformer('bert-base-nli-mean-tokens')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    #print("Done")
    for full_text, catch_phrases in gt:
        # Embed each sentence
        #sentence_embeddings = enc_model.encode(full_text)
        encoded = encoder.encode(full_text)
        # Cluster each embedding
        cluster_n = 11
        #clusters = cluster(sentence_embeddings, minimum_samples=cluster_n)
        clusters = cluster(encoded, minimum_samples=cluster_n)
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))

        # Select representative cluster
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                   encoded)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])
        print(ordering)
        summary = ' '.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')
        model_sum.append(summary)
        print([(full_text[closest[idx]], closest[idx]) for idx in ordering])
        print(summary)
        print(len(catch_phrases))
        print(".".join(catch_phrases))
        gt_sum.append(".".join(catch_phrases))
        break
Example #11
def skipthought_encode(emails):
    """
    Obtains sentence embeddings for each sentence in the emails
    """
    enc_emails = [None] * len(emails)
    cum_sum_sentences = [0]
    sent_count = 0
    for email in emails:
        sent_count += len(email)
        cum_sum_sentences.append(sent_count)

    all_sentences = [sent for email in emails for sent in email]
    print('Loading pre-trained models...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)

    for i in range(len(emails)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i + 1]
        enc_emails[i] = enc_sentences[begin:end]
    return enc_emails
Example #12
    def summarize(self, sentences):
        # Getting sentence embeddings
        encoder = skipthoughts.Encoder(self.model)
        vectors = encoder.encode(sentences, verbose=False)
        print('Sentences have been encoded...')
        # Retrieving clusters
        n_clusters = int(np.ceil(len(vectors)**0.5))
        # n_clusters = int(np.ceil(SUMMARY_LENGTH))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        # print pca embeddings
        self.print_embeddings(vectors)
        kmeans.fit(vectors)
        avg = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        # Choosing sentences closest to cluster centers
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, vectors)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        # Returning summary
        summary = ' '.join([sentences[closest[idx]] for idx in ordering])
        return summary
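
A note on the design above: the cluster count grows with the square root of the sentence count, each cluster contributes the sentence whose embedding is closest to its centroid, and the picks are ordered by the mean original position of each cluster's sentences so the summary roughly follows source order. A minimal, hypothetical driver (the Summarizer wrapper name and input file are assumptions; compare the constructors in Examples #16 and #17):

# Hypothetical driver; the Summarizer class name and article.txt path are assumptions.
import nltk

text = open('article.txt').read()
sentences = nltk.sent_tokenize(text)
summarizer = Summarizer()  # assumed wrapper whose __init__ loads skipthoughts and sets self.model
print(summarizer.summarize(sentences))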
Example #13
def caption2Vectors(video_filename, video_captions):

    captions = []
    video_ctr = 0

    # Prepare a processed list of captions.
    for caption in video_captions:
        caption = caption.decode('utf-8').strip()
        captions.append(caption)

    print "Total number of captions are: "
    print len(captions)
    print "Initializing Skipthoughts vectorizer."

    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)

    # first 2400 dims -> uni-skip model, last 2400 dims -> bi-skip model
    print("Starting vector conversion.")
    vectors = encoder.encode(captions)

    print("Skip-thought vector matrix shape:")
    print(vectors.shape)

    # Save each caption vector as a .npy file named after the corresponding video.
    for caption in vectors:
        file_name_video = video_filename[video_ctr]
        output_file = output_folder + str(file_name_video) + ".npy"

        with open(output_file, 'wb') as f2:  # np.save needs a binary-mode file
            np.save(f2, caption)
            print(str(file_name_video) + "    extracted")
        video_ctr += 1
Example #14
File: utils.py  Project: zheng-yanan/CGMH
import pickle as pkl
if config.sim == 'word_max' or config.sim == 'combine':
    emb_word, emb_id = pkl.load(open(config.emb_path))

import sys
sys.path.insert(0, config.skipthoughts_path)
sys.path.insert(0, config.emb_path)
sys.path.insert(0, '../utils/dict_emb')
from dict_use import dict_use
dict_use = dict_use(config.dict_path)
sen2id = dict_use.sen2id
id2sen = dict_use.id2sen
if config.sim == 'skipthoughts' or config.sim == 'combine':
    import skipthoughts
    skip_model = skipthoughts.load_model()
    skip_encoder = skipthoughts.Encoder(skip_model)
if config.sim == 'word_max' or config.sim == 'combine':
    #id2freq=pkl.load(open('./data/id2freq.pkl'))
    pass


def normalize(x, e=0.05):
    tem = copy(x)
    if max(tem) == 0:
        tem += e
    return tem / tem.sum()


def reverse_seq(input, sequence_length, target):
    batch_size = input.shape[0]
    num_steps = input.shape[1]
Example #15
        solution = sentence.split(' ')[-1]
        solutions.append(int(solution))
        sentence = readline()

    return sentences, solutions


input_images = load_input_images('./dataset/input')
input_sentences, labels = load_input_sentences()

inputs = list(zip(input_images, input_sentences, labels))
random.shuffle(inputs)
input_images, input_sentences, labels = zip(*inputs)

model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
input_sentences = encoder.encode(input_sentences)


def load_input(path):
    input_images = load_input_images(path)
    input_sentences, labels = load_input_sentences()

    inputs = list(zip(input_images, input_sentences, labels))
    random.shuffle(inputs)
    input_images, input_sentences, labels = zip(*inputs)

    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    input_sentences = encoder.encode(input_sentences)

    return input_images, input_sentences, labels
Example #16
    def __init__(self):
        self.encoder = skipthoughts.Encoder(skipthoughts.load_model())
Example #17
    def __init__(self, **kwargs):
        self.model = skipthoughts.load_model()
        self.encoder = skipthoughts.Encoder(self.model)
Example #18
    return return_arr


@np.vectorize
def decode(x):
    return x.decode('utf8')


if __name__ == '__main__':
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        description='Extract skipthought vectors',
        formatter_class=ArgumentDefaultsHelpFormatter)

    m = skipthoughts.Encoder(skipthoughts.load_model())

    args = parser.parse_args()

    vuamc = pd.read_csv('./data/vuamc.csv',
                        keep_default_na=False)
    vuamc.min_context = decode(vuamc.min_context)

    unique_ctx = vuamc.min_context.unique()
    ctx_embs = infer_vector_skipthought(m, unique_ctx)

    ctx_to_idx = {ctx: i for i, ctx in enumerate(unique_ctx)}

    v_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)
    s_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)
    o_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)