def encode_article(self):
    """
    Embed every sentence of ``self.article`` into a sentence vector.

    ``self.article`` is a list of articles, each a list of sentence
    strings (presumably — confirm against the caller).  All sentences
    are flattened into one batch so the encoder runs only once, then
    the flat result is sliced back per article.

    :return: list, parallel to ``self.article``, of encoded-sentence arrays
    """
    sentences_to_encode = []
    # Cumulative sentence counts: slice boundaries for splitting the
    # flat encoding back into per-article chunks.
    sentence_list = [0]
    sent_count = 0
    for sentence in self.article:
        sent_count += len(sentence)
        sentence_list.append(sent_count)
        sentences_to_encode.extend(sentence)

    print('Loading models...'),
    # CONSISTENCY FIX: every other call site uses the module-level
    # skipthoughts.load_model(), not skipthoughts.LoadModel().load_model().
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...'),
    encoded_sentences = encoder.encode(sentences_to_encode, verbose=False)
    print('Done')

    # BUG FIX: the result list needs one slot per article; the original
    # `[None]` only had room for index 0 and raised IndexError beyond it.
    encoded_article = [None] * len(self.article)
    for i in range(len(self.article)):
        begin = sentence_list[i]
        end = sentence_list[i + 1]
        encoded_article[i] = encoded_sentences[begin:end]
    return encoded_article
def load_inputs(path, limit=-1):
    '''
    Load pickled (caption, resnet) or (caption, resnet, embedding) tuples.

    :param path: path to a pickle file containing a list of tuples
    :param limit: if not -1, keep only the first ``limit`` entries
    :return: (captions, resnet_embeddings, vectors)
    :raises ValueError: when the tuples are neither 2- nor 3-element
    '''
    # load captions and resnet embeddings
    # BUG FIX: close the file deterministically instead of leaking the handle.
    with open(path, "rb") as f:
        split = pickle.load(f)
    if limit != -1:
        split = split[:limit]

    if len(split[0]) == 2:
        # No precomputed embeddings: encode captions via skip-thoughts.
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        captions = [s[0] for s in split]
        resnet_embeddings = [s[1].flatten() for s in split]
        vectors = encoder.encode(captions)
    elif len(split[0]) == 3:
        captions = [s[0] for s in split]
        resnet_embeddings = [s[1].flatten() for s in split]
        vectors = np.array([s[2].flatten() for s in split])
    else:
        # BUG FIX: raising a plain string is a TypeError in Python 3;
        # raise a real exception class instead.
        raise ValueError(
            "Input pickled vectors should be a list of two tuples "
            "(caption,resnet) or three-tuples (caption,resnet,embedding)")
    return captions, resnet_embeddings, vectors
def skipthought_encode(sentences):
    """
    Return skip-thought embeddings, one row per input sentence.

    :param sentences: list of sentence strings
    """
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    return encoder.encode(sentences)
def load_input(path):
    """
    Load images, sentences and labels, shuffle them in unison, and
    encode the sentences with skip-thoughts.

    :param path: directory passed to the image loader
    :return: (images, encoded_sentences, labels) tuples after shuffling
    """
    images = load_input_images(path)
    sentences, labels = load_input_sentences()
    # Shuffle all three sequences with a single permutation.
    samples = list(zip(images, sentences, labels))
    random.shuffle(samples)
    images, sentences, labels = zip(*samples)
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    return images, encoder.encode(sentences), labels
def transform_sentences(sentences):
    """
    Build sentence embeddings using the Skip-thoughts model.

    :param sentences: list of sentence strings to embed.
    :return: Torch matrix of stacked embeddings, one row per sentence.
    """
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    embeddings = encoder.encode(sentences, verbose=True)
    return torch.stack([torch.from_numpy(row) for row in embeddings])
def encodeSentences(reviews):
    """
    Encode every sentence of every review with skip-thoughts.

    :param reviews: list of reviews, each a list of sentence strings
    :return: list parallel to ``reviews`` with each review's sentence vectors
    """
    enc_reviews = [None] * len(reviews)
    cum_sum_sentences = [0]
    sent_count = 0
    for review in reviews:
        sent_count += len(review)
        cum_sum_sentences.append(sent_count)

    # BUG FIX: the encoder expects a flat list of sentence strings, but
    # the original passed the nested per-review lists and never used the
    # cumulative offsets it computed.  Flatten, encode once, then slice
    # the result back per review (same pattern as skipthought_encode).
    all_sentences = [sent for review in reviews for sent in review]

    print('Loading pretrained model...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)

    for i in range(len(reviews)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i + 1]
        enc_reviews[i] = enc_sentences[begin:end]
    return enc_reviews
def get_pretrained_encodings(pretrained=False):
    '''
    Compute skip-thought vectors for the Amazon Food train/test splits
    and save them as .npy files.

    :param pretrained: if True, use the published pre-trained
        skip-thoughts model; otherwise load a custom bi-skip model via
        ``tools`` and encode with it.
    '''
    # Vocabulary used by preprocess() to filter the raw reviews.
    word_set = set()
    with open(os.path.join(DATA_PATH, 'word2vec/dict.txt'), 'r') as dict_f:
        for line in dict_f:
            word_set.add(line.strip())

    # Getting the data.
    # BUG FIX: pickle files are binary — open with 'rb', not 'r'
    # (text mode corrupts reads on Windows and fails under Python 3).
    with open(os.path.join(DATA_PATH, 'amazon_food/train_data.pkl'), 'rb') as f:
        train_data = cPickle.load(f)
    train_preprocessed = preprocess(train_data[0], word_set)
    with open(os.path.join(DATA_PATH, 'amazon_food/test_data.pkl'), 'rb') as f:
        test_data = cPickle.load(f)
    test_preprocessed = preprocess(test_data[0], word_set)

    if pretrained:
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        test_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_test_pretrained.npy'
        )
        train_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_train_pretrained.npy'
        )
        print('Encoding training vectors')
        train_vectors = encoder.encode(train_preprocessed)
        print('Encoding test vectors')
        test_vectors = encoder.encode(test_preprocessed)
    else:
        model = tools.load_model(None)
        test_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_test_bi.npy')
        train_save_path = os.path.join(
            DATA_PATH,
            'amazon_food/skip_thought_vecs/skip_thought_vecs_train_bi.npy')
        print('Encoding training vectors')
        train_vectors = tools.encode(model, train_preprocessed)
        print('Encoding test vectors')
        test_vectors = tools.encode(model, test_preprocessed)

    np.save(train_save_path, train_vectors)
    np.save(test_save_path, test_vectors)
def test_data(text_path):
    """
    Read 'id,text' lines and encode each caption five times.

    Each input line contributes 5 (id, sample_index) pairs and 5 copies
    of its text to the encoder batch.

    :param text_path: path to a file with one 'id,text' record per line
    :return: (ids, captions) — ids is a list of (id, i) pairs, captions
        the skip-thought encodings aligned with it
    """
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    # BUG FIX: close the input file deterministically.
    with open(text_path) as f:
        testing_text = f.read().strip().split('\n')
    ids = []
    texts = []
    for line in testing_text:
        # BUG FIX: split only on the FIRST comma so captions that contain
        # commas are kept intact (the original silently truncated them).
        # Also avoid shadowing the builtin `id`.
        line_id, text = line.split(',', 1)
        for i in range(5):
            ids.append((line_id, i))
            texts.append(text)
    captions = encoder.encode(texts)
    return ids, captions
def generate_and_save_word_embeddings_for_sentences_text(
        input_file, embeddings_output_path, embeddings_id_output_file,
        for_testing=False, is_labeled=True):
    """
    Generate skip-thought embeddings per story and save them to disk.

    Each story's embeddings are written as packed 32-bit floats to
    ``embeddings_output_path + story.id``; every story id is appended,
    one per line, to ``embeddings_id_output_file``.

    Parameters:
    -----------
    input_file: string
        path to the file to load the data from
    embeddings_output_path: string
        directory/prefix to save the embeddings in
    embeddings_id_output_file: string
        path to the file to save the embeddings id in
    for_testing: bool
        if True, encode the story with both candidate endings;
        otherwise with the right ending only
    is_labeled: bool
        forwarded to the data loader
    """
    list_of_stories = load_and_process_text_data(input_file, for_testing,
                                                 is_labeled)
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    # BUG FIX: the id file receives text (story.id plus "\n"), so it must
    # be opened in text mode — "wb" makes the str writes a TypeError on
    # Python 3.  All files are now closed via context managers.
    with open(embeddings_id_output_file, "w") as fout_id:
        for story in list_of_stories:
            print(story)
            if not for_testing:
                embeddings = encoder.encode(
                    story.get_story_with_right_ending_as_list())
            else:
                embeddings = encoder.encode(
                    story.get_story_with_both_endings_as_list())
            with open(embeddings_output_path + story.id, "wb") as output_file:
                for embed in embeddings:
                    b = bytes()
                    b = b.join((struct.pack('f', e) for e in embed))
                    output_file.write(b)
            print(story.id)
            fout_id.write(story.id)
            fout_id.write("\n")
def main():
    """
    Executes the entire pipeline of the code: embed each document's
    sentences with skip-thoughts, cluster the embeddings, and build an
    extractive summary from the sentences nearest each cluster center.
    :return: void
    """
    gt = getGroundTruth()
    model_sum, gt_sum = [], []
    #print("Fetching encoder model...", end=" ")
    #enc_model = SentenceTransformer('bert-base-nli-mean-tokens')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    #print("Done")
    for full_text, catch_phrases in gt:
        # Embed each sentence
        #sentence_embeddings = enc_model.encode(full_text)
        encoded = encoder.encode(full_text)
        # Cluster each embedding
        cluster_n = 11
        #clusters = cluster(sentence_embeddings, minimum_samples=cluster_n)
        clusters = cluster(encoded, minimum_samples=cluster_n)
        # Mean sentence index of each cluster's members — used below to
        # order the summary sentences by their position in the document.
        centroids = []
        for idx in range(cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))
        # Select representative cluster: the sentence closest to each center.
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                   encoded)
        ordering = sorted(range(cluster_n), key=lambda k: centroids[k])
        print(ordering)
        summary = ' '.join([full_text[closest[idx]]
                            for idx in ordering]).replace('\n', ' ')
        model_sum.append(summary)
        print([(full_text[closest[idx]], closest[idx]) for idx in ordering])
        print(summary)
        print(len(catch_phrases))
        print(".".join(catch_phrases))
        gt_sum.append(".".join(catch_phrases))
        # NOTE(review): this `break` stops after the first document — looks
        # like a debugging leftover; confirm intent before removing it.
        break
def skipthought_encode(emails):
    """
    Compute skip-thought embeddings for every sentence of every email.

    :param emails: list of emails, each a list of sentence strings
    :return: list parallel to ``emails`` with each email's sentence vectors
    """
    # Running offsets: boundaries for slicing the flat encoding per email.
    offsets = [0]
    for email in emails:
        offsets.append(offsets[-1] + len(email))
    flat_sentences = [sentence for email in emails for sentence in email]

    print('Loading pre-trained models...')
    encoder = skipthoughts.Encoder(skipthoughts.load_model())
    print('Encoding sentences...')
    vectors = encoder.encode(flat_sentences, verbose=False)

    return [vectors[offsets[i]:offsets[i + 1]] for i in range(len(emails))]
def summarize(self, sentences):
    """
    Extractive summary: cluster sentence embeddings with k-means and pick
    the sentence nearest each cluster center, ordered by the average
    position of the cluster's members in the original text.
    """
    # Getting sentence embeddings
    encoder = skipthoughts.Encoder(self.model)
    embeddings = encoder.encode(sentences, verbose=False)
    print('Sentences have been encoded...')

    # One cluster per sqrt(#sentences), rounded up.
    num_clusters = int(np.ceil(len(embeddings) ** 0.5))
    km = KMeans(n_clusters=num_clusters, random_state=0)
    # print pca embeddings
    self.print_embeddings(embeddings)
    km.fit(embeddings)

    # Mean sentence index per cluster — the document-order key.
    mean_positions = [np.mean(np.where(km.labels_ == c)[0])
                      for c in range(num_clusters)]

    # Choosing sentences closest to cluster centers
    nearest, _ = pairwise_distances_argmin_min(km.cluster_centers_,
                                               embeddings)
    order = sorted(range(num_clusters), key=lambda c: mean_positions[c])
    return ' '.join(sentences[nearest[c]] for c in order)
def caption2Vectors(video_filename, video_captions):
    """
    Encode raw video captions with skip-thoughts and save one vector per
    video as a .npy file named after the matching ``video_filename`` entry.

    :param video_filename: list of output file stems, parallel to captions
    :param video_captions: list of utf-8 encoded caption byte strings
    """
    # Prepare processed list of captions.  strip() already removes the
    # trailing newline, so the original's extra strip/rstrip were redundant.
    captions = []
    for caption in video_captions:
        captions.append(caption.decode('utf-8').strip())

    print("Total number of captions are: ")
    print(len(captions))
    print("Initializing Skipthoughts vectorizer.")
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    # first 2400 -> uni-skip model, last 2400 -> bi-skip model
    print("Starting Vectorizer conversion.")
    print("Skip-thought vector size is :")
    vectors = encoder.encode(captions)
    print(vectors.shape)

    # Save each caption vector under its video's filename.
    # BUG FIX: .npy is a binary format — open the file in 'wb' mode
    # ('w' corrupts output on Windows and fails under Python 3).
    for video_ctr, caption in enumerate(vectors):
        file_name_video = video_filename[video_ctr]
        output_file = output_folder + str(file_name_video) + ".npy"
        with open(output_file, 'wb') as f2:
            np.save(f2, caption)
        print(str(file_name_video) + " extracted")
import pickle as pkl

# Word embeddings are only needed for the word-level similarity modes.
if config.sim == 'word_max' or config.sim == 'combine':
    emb_word, emb_id = pkl.load(open(config.emb_path))
import sys
# Make the skip-thoughts code, embedding data and dictionary helpers importable.
sys.path.insert(0, config.skipthoughts_path)
sys.path.insert(0, config.emb_path)
sys.path.insert(0, '../utils/dict_emb')
from dict_use import dict_use

# NOTE(review): the class name is shadowed by its own instance here;
# sen2id/id2sen convert between sentences and id sequences.
dict_use = dict_use(config.dict_path)
sen2id = dict_use.sen2id
id2sen = dict_use.id2sen
# Sentence-level similarity uses a skip-thoughts encoder.
if config.sim == 'skipthoughts' or config.sim == 'combine':
    import skipthoughts
    skip_model = skipthoughts.load_model()
    skip_encoder = skipthoughts.Encoder(skip_model)
if config.sim == 'word_max' or config.sim == 'combine':
    #id2freq=pkl.load(open('./data/id2freq.pkl'))
    pass


def normalize(x, e=0.05):
    """Normalize x to sum to 1; lift an all-zero input by e first so the
    division is defined."""
    tem = copy(x)
    if max(tem) == 0:
        tem += e
    return tem / tem.sum()


def reverse_seq(input, sequence_length, target):
    # NOTE(review): this definition continues beyond the visible excerpt.
    batch_size = input.shape[0]
    num_steps = input.shape[1]
        # NOTE(review): the enclosing function begins before this excerpt.
        # The last whitespace-separated token of each sentence is its
        # integer label.
        solution = sentence.split(' ')[-1]
        solutions.append(int(solution))
        sentence = readline()
    return sentences, solutions


# NOTE(review): module-level duplicate of load_input() below with the
# input path hard-coded — presumably script-mode leftovers; verify.
input_images = load_input_images('./dataset/input')
input_sentences, labels = load_input_sentences()
inputs = list(zip(input_images, input_sentences, labels))
random.shuffle(inputs)
input_images, input_sentences, labels = zip(*inputs)
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
input_sentences = encoder.encode(input_sentences)


def load_input(path):
    """Load images, sentences and labels, shuffle them in unison, and
    encode the sentences with skip-thoughts."""
    input_images = load_input_images(path)
    input_sentences, labels = load_input_sentences()
    inputs = list(zip(input_images, input_sentences, labels))
    random.shuffle(inputs)
    input_images, input_sentences, labels = zip(*inputs)
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    # NOTE(review): no return statement is visible in this excerpt —
    # another copy of this function returns (images, sentences, labels).
    input_sentences = encoder.encode(input_sentences)
def __init__(self):
    """Load the skip-thoughts model and build the sentence encoder."""
    model = skipthoughts.load_model()
    self.encoder = skipthoughts.Encoder(model)
def __init__(self, **kwargs):
    """Load the skip-thoughts model once and keep both the raw model
    and an encoder built on top of it."""
    loaded = skipthoughts.load_model()
    self.model = loaded
    self.encoder = skipthoughts.Encoder(loaded)
    # NOTE(review): tail of a function defined before this excerpt.
    return return_arr


@np.vectorize
def decode(x):
    """Decode a utf-8 byte string; vectorized element-wise by numpy."""
    return x.decode('utf8')


if __name__ == '__main__':
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    parser = ArgumentParser(
        description='Extract skipthought vectors',
        formatter_class=ArgumentDefaultsHelpFormatter)
    m = skipthoughts.Encoder(skipthoughts.load_model())
    args = parser.parse_args()
    vuamc = pd.read_csv('./data/vuamc.csv', keep_default_na=False)
    vuamc.min_context = decode(vuamc.min_context)
    # Embed each unique context once; map rows back through ctx_to_idx.
    unique_ctx = vuamc.min_context.unique()
    ctx_embs = infer_vector_skipthought(m, unique_ctx)
    ctx_to_idx = {ctx: i for i, ctx in enumerate(unique_ctx)}
    # Per-row embedding buffers — presumably filled by code beyond this
    # excerpt; verify downstream.
    v_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)
    s_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)
    o_embs = np.zeros((vuamc.shape[0], HIDDEN_DIM_SIZE), dtype=np.float32)