def transform(self, X):
    list_of_emb = []
    size_of_emb = stacked_embeddings.embedding_length
    if not isinstance(X, str):
        for doc in X:
            p_str = parse_string(doc)
            if not p_str:
                # empty/unparsable document: fall back to a zero vector
                list_of_emb.append(np.zeros((size_of_emb,), dtype=np.float32))
            else:
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                list_of_emb.append(a_set.get_embedding().cpu().detach().numpy())
        to_ret = np.array(list_of_emb)
    else:
        try:
            p_str = parse_string(X)
            if not p_str:
                to_ret = np.zeros((size_of_emb,), dtype=np.float32)
            else:
                a_set = Sentence(p_str)
                stacked_embeddings.embed(a_set)
                to_ret = a_set.get_embedding().cpu().detach().numpy().reshape(1, -1)
        except Exception:
            print(type(X))
            print(X)
            # re-raise after logging so the caller sees the real error
            raise
    return to_ret
def cosine_embedding(sentence1, sentence2, model):
    embeddings = DocumentPoolEmbeddings(model, mode='mean')
    s1 = Sentence(sentence1)
    s2 = Sentence(sentence2)
    embeddings.embed(s1)
    embeddings.embed(s2)
    v1 = s1.get_embedding()
    v2 = s2.get_embedding()
    # print(v1, v2)  # check that you don't get empty tensors
    cos_sim = dot(v1, v2) / (norm(v1) * norm(v2))
    return cos_sim  # lies in [-1, 1]
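# Hypothetical usage sketch for cosine_embedding (the model choice is an
# assumption, not from the original module). It presumes a flair release whose
# DocumentPoolEmbeddings takes a list of token embeddings plus a `mode` keyword,
# and that `dot` / `norm` come from numpy.
from flair.embeddings import WordEmbeddings

glove = WordEmbeddings('glove')
score = cosine_embedding("the cat sat on the mat",
                         "a kitten rested on the rug",
                         [glove])
print(float(score))  # cosine similarity, in [-1, 1]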
def is_difference_large(text1, text2):
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)
    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)
    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    print(np.mean(similarity))
def embedize(self, data_subset_list):
    tweet = Sentence(data_subset_list)
    embedding = TransformerDocumentEmbeddings(self.embedding)
    embedding.embed(tweet)
    tweet_emb = tweet.get_embedding()
    tweet_emb_np = tweet_emb.detach().numpy()
    return tweet_emb_np
def embed(card_tag, card_as_sentence, card_words, card_words_org):
    stacked_embeddings.embed(card_tag)
    # stacked_embeddings.embed(card_as_sentence)
    # print(card_as_sentence.get_embedding().reshape(1, -1))
    word_list = []
    token_removed_ct = 0
    card_tag_emb = card_tag.get_embedding()
    if not sent_level:
        for count, word in enumerate(card_words_org):
            n_gram_word = card_words[count]
            stacked_embeddings.embed(n_gram_word)
            n_gram_emb = n_gram_word.get_embedding()
            if graph:
                doc_embeddings.append(n_gram_emb.numpy())
            word_sim = cos(card_tag_emb.reshape(1, -1), n_gram_emb.reshape(1, -1))
            word_tup = (card_words_org[count], word_sim)
            word_list.append(word_tup)
        if graph:
            doc_embeddings.append(card_tag_emb.numpy())
        print(len(word_list))
        print(len(card_words))
        print(len(card_words_org))
    else:
        for sentence in card_as_sentence:
            set_obj = Sentence(sentence)
            stacked_embeddings.embed(set_obj)
            sentence_emb = set_obj.get_embedding()
            word_sim = cos(card_tag_emb.reshape(1, -1), sentence_emb.reshape(1, -1))
            sentence_tup = (sentence, word_sim)
            word_list.append(sentence_tup)
    return word_list
def generate_topics_on_series(series):
    """Embed each text in the series with stacked BERT + Flair embeddings.

    Based on https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        np.ndarray: one 7168-dimensional document embedding per row of the series.
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill tensor with embeddings
    for i, text in enumerate(tqdm(series)):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()
    return X
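# Hypothetical usage sketch (the sample texts are assumptions, not from the
# original module): embed a small pandas Series. The function allocates its
# tensor on the GPU, so a CUDA device is required.
import pandas as pd

sample_texts = pd.Series(["first short document", "second short document"])
doc_vectors = generate_topics_on_series(sample_texts)
print(doc_vectors.shape)  # (2, 7168): stacked BERT + forward/backward Flair embeddings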
def predict(self, X: List[str], n_labels: int = 10) -> np.array:
    if not hasattr(self, 'tag_embeddings_'):
        raise NotFittedError
    if self.verbose:
        X_iterator = tqdm(X, desc='Computing embeddings for prediction samples')
    else:
        X_iterator = X
    X_embeddings = []
    for doc in X_iterator:
        doc_obj = Sentence(doc)
        self.document_embedder_.embed(doc_obj)
        X_embeddings.append(doc_obj.get_embedding().detach().numpy())
    nn = NearestNeighbors(metric=self.distance_metric,
                          n_neighbors=n_labels,
                          n_jobs=self.n_jobs)
    nn.fit(self.tag_embeddings_)
    y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]), dtype='int8')
    for sample_ind, text_embedding in enumerate(X_embeddings):
        nearest_neighbors = nn.kneighbors([text_embedding])[1][0]
        y_pred[sample_ind, nearest_neighbors] = 1
    return y_pred.tocsr()
def fit(self, X, y):
    tag_docs = self._create_tag_corpus(X, self._create_tag_docs(y))
    self.document_embedder_ = DocumentPoolEmbeddings(
        self.word_embeddings,
        pooling=self.pooling,
        fine_tune_mode=self.fine_tune_mode)
    if self.verbose:
        doc_iterator = tqdm(tag_docs, desc='Computing tag embeddings')
    else:
        doc_iterator = tag_docs
    self.tag_embeddings_ = []
    for doc in doc_iterator:
        doc_obj = Sentence(doc)
        self.document_embedder_.embed(doc_obj)
        self.tag_embeddings_.append(doc_obj.get_embedding().detach().numpy())
    self.tag_embeddings_ = np.array(self.tag_embeddings_)
    return self
def phrase_to_docvec(phrase: str, doc_embedding=doc_embedding):
    # Crop the phrase to the first 2000 characters unless I want to look up
    # more things; should be good enough.
    # Lowercase, since we're using the uncased version.
    phrase_s = Sentence(phrase[:2000].lower(), use_tokenizer=True)
    doc_embedding.embed(phrase_s)
    # get_embedding() returns a torch.FloatTensor
    phrase_emb_tensor = phrase_s.get_embedding()
    return np.array(phrase_emb_tensor.data)
def get_vector(sen, document_embeddings):
    """Get the document vector for input."""
    sentence = Sentence(sen)
    document_embeddings.embed(sentence)
    get_embed = sentence.get_embedding().detach()
    if get_embed.device.type == 'cuda':
        return get_embed.cpu().numpy()
    return get_embed.numpy()
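# Hypothetical usage sketch (the embedding setup is an assumption, not from the
# original module): pool GloVe word embeddings into one document vector and get
# it back as a NumPy array whether it lives on CPU or GPU.
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])
vec = get_vector("a short example document", document_embeddings)
print(vec.shape)  # (100,) for the 100-dimensional GloVe vectors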
def get_similarities(self, query):
    """Get the similarities between the query and all docs in the corpus (WordEmbeddingModel).

    :param query:
    :return: the list of similarities
        [sim_between_query_and_first_doc, sim_between_query_and_second_doc, ...]
    """
    seq = Sentence(query)
    self.vectorizer.embed(seq)
    query_vector = seq.get_embedding().tolist()
    return cosine_similarity([query_vector], self.X).flatten()
def test_fine_tunable_flair_embedding():
    language_model_forward = LanguageModel(Dictionary.load('chars'),
                                           is_forward_lm=True,
                                           hidden_size=32,
                                           nlayers=1)

    embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)],
        hidden_size=128,
        bidirectional=False)
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 128
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 32
    assert len(sentence.get_embedding()) == embeddings.embedding_length
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0
def compute_embedding(embedding, remove_punctuation: bool, file_name: str):
    """
    Computes the embedding with the given model for all arguments

    :param embedding: Model
    :param remove_punctuation: Bool to indicate if punctuation should be removed
    :param file_name:
    """
    arguments = Arguments()
    document_embedding = DocumentPoolEmbeddings([embedding])
    embedded_arguments = {}
    for argument in arguments.ground_truth_arguments:
        premises = argument['premises']
        conclusion = argument['conclusion']
        conclusion_text = conclusion['conclusion_text']
        if remove_punctuation:
            conclusion_text = remove_punctuations(conclusion_text)
        conclusion_sentence = Sentence(conclusion_text)
        document_embedding.embed(conclusion_sentence)
        embedded_conclusion = conclusion_sentence.get_embedding().detach().numpy().tolist()

        embedded_premises = {}
        argument_uid = None
        for premise in premises:
            premise_text = premise[1]
            if remove_punctuation:
                premise_text = remove_punctuations(premise_text)
            premise_sentence = Sentence(premise_text)
            document_embedding.embed(premise_sentence)
            embedded_premise = premise_sentence.get_embedding().detach().numpy().tolist()
            embedded_premises[premise[2]] = embedded_premise
            argument_uid = premise[0]

        embedded_arguments[argument_uid] = [embedded_conclusion, embedded_premises]

    save_embedding(embedded_arguments, file_name)
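# Hypothetical usage sketch (the embedding choice and output file name are
# assumptions, not from the original code): pool GloVe vectors over every
# conclusion and premise in the ground-truth arguments and persist the result
# via save_embedding().
from flair.embeddings import WordEmbeddings

compute_embedding(WordEmbeddings('glove'),
                  remove_punctuation=True,
                  file_name='glove_argument_embeddings.json')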
def is_difference_large(text1: str, text2: str) -> bool:
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)
    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)
    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    # Some example threshold
    # TODO: Determine a good threshold, for example in pitch
    if np.mean(similarity) > 0.06:
        return False
    else:
        return True
def getBertVector(text):
    # create a sentence
    sentence = Sentence(text)
    # embed words in sentence
    document_embeddings.embed(sentence)
    return sentence.get_embedding().detach().numpy()
def create_corpus(pdf_folder):
    corpus = []
    document_embeddings = embedding()
    for file1 in os.listdir(pdf_folder):
        if file1.endswith(".pdf"):
            pdf = pdfparser(os.path.join(pdf_folder, file1))
            sentence = Sentence(pdf)
            document_embeddings.embed(sentence)
            corpus.append(sentence.get_embedding().detach().numpy())
    # Save corpus to a pickle file
    with open('corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f)
def similaridade(frase, tipo_embeding):
    sentence = Sentence(frase, use_tokenizer=True)
    embeddings[tipo_embeding].embed(sentence)
    vetor_frase = sentence.get_embedding()
    lista_similaridade = []
    for i in dicio_vetores[tipo_embeding]:
        # The first position of the tuple is the sentence; the second is the embedding vector
        lista_similaridade.append((arcos(vetor_frase, i[1]), i[0]))
    lista_similaridade.sort(reverse=False)
    return lista_similaridade[:10]
def createFlairEmbeddings(embedding_list, data):
    embeddings = []
    sentences = data['Interest_Name'].values
    model = DocumentPoolEmbeddings(embedding_list, fine_tune_mode='nonlinear')
    for sent in sentences:
        sentence = Sentence(sent)
        model.embed(sentence)
        modeled_embedding = sentence.get_embedding()
        array = modeled_embedding.cpu().detach().numpy()
        embeddings.append(array)
    return embeddings
def get_vector(sen, document_embeddings):
    """Get the document vector for input."""
    if isinstance(sen, pd.Series):
        sentence = [Sentence(i) for i in sen]
        document_embeddings.embed(sentence)
        get_embed = [i.get_embedding().detach() for i in sentence]
        if any(i.device.type == 'cuda' for i in get_embed):
            return [x.cpu().numpy() for x in get_embed]
        return [x.numpy() for x in get_embed]
    else:
        sentence = Sentence(sen)
        document_embeddings.embed(sentence)
        get_embed = sentence.get_embedding().detach()
        if get_embed.device.type == 'cuda':
            return get_embed.cpu().numpy()
        return get_embed.numpy()
def preprocess_line(original_line, lem=True, stem=True, embed=True,
                    remove_stop_words=True, extra_features=False):
    tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
    # lowercase and tokenize the line
    line = original_line.lower()
    line = tokenizer.tokenize(line)
    if extra_features:
        features = _extract_features(original_line, line)
    if lem:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]
    if stem:
        stemmer = PorterStemmer()
        line = [stemmer.stem(word) for word in line]
    if remove_stop_words:
        stop_words = set(stopwords.words('english'))
        new_line = [word for word in line if word not in stop_words]
        # number of stop words removed
        if extra_features:
            features.append(len(line) - len(new_line))
        line = new_line
    line = " ".join(line)
    if embed:
        try:
            sentence = Sentence(line)
            _EMBEDDER.embed(sentence)
            line = sentence.get_embedding().cpu().detach().numpy()
        except Exception:
            return None
        if extra_features:
            # concat features at the end!
            line = np.concatenate((line, np.asarray(features)))
    return line
def decision_function(self, X: List[str], n_labels: int = 10):
    if not hasattr(self, 'tag_embeddings_'):
        raise NotFittedError
    if self.verbose:
        X_iterator = tqdm(X, desc='Computing embeddings for prediction samples')
    else:
        X_iterator = X
    X_embeddings = []
    for doc in X_iterator:
        if doc:
            doc_obj = Sentence(doc)
        else:
            doc_obj = Sentence('Unknown')
        self.document_embedder_.embed(doc_obj)
        try:
            X_embeddings.append(doc_obj.get_embedding().detach().numpy())
        except RuntimeError as e:
            # TODO: give index of the corrupted sample
            print('Could not compute embedding for sample; inserting zero vector')
            print(e)
            X_embeddings.append(
                np.zeros((self.tag_embeddings_.shape[1],),
                         dtype=self.tag_embeddings_.dtype))
    nn = NearestNeighbors(metric=self.distance_metric,
                          n_neighbors=n_labels,
                          n_jobs=self.n_jobs)
    nn.fit(self.tag_embeddings_)
    y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]), dtype='float')
    for sample_ind, sample_vec in enumerate(X_embeddings):
        distances, indices = nn.kneighbors([sample_vec])
        for distance, label_index in zip(distances, indices):
            y_pred[sample_ind, label_index] = distance
    return y_pred.tocsr()
def compute_pretrained_individual_transformer_embedding(query, word_embedding,
                                                        document_embeddings):
    """
    :param query: String :: arbitrary sentence
    :param word_embedding
    :param document_embeddings
    :return: n-dimensional embedding
    """
    # Halve the query until it fits within the transformer's 512-token limit.
    tokenized_text = word_embedding.tokenizer.tokenize(query)
    tokenized_len = len(tokenized_text)
    while tokenized_len > 512:
        query = query[:len(query) // 2]
        tokenized_text = word_embedding.tokenizer.tokenize(query)
        tokenized_len = len(tokenized_text)

    sentence = Sentence(query)
    document_embeddings.embed(sentence)
    tensor_query_embedding = sentence.get_embedding()
    numpy_query_embedding = tensor_query_embedding.data.cpu().numpy()
    return numpy_query_embedding
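# Hypothetical usage sketch (model names are assumptions, not from the original
# code): the word embedding only supplies the tokenizer used for the 512-token
# length check, while the document embedding produces the returned vector.
from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings

word_embedding = TransformerWordEmbeddings('bert-base-uncased')
document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased')
query_vector = compute_pretrained_individual_transformer_embedding(
    "An arbitrarily long query that may need truncating.",
    word_embedding,
    document_embeddings)
print(query_vector.shape)  # (768,) for bert-base models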
def transform(self, X, y=None, **kwargs):
    """
    An abstract method that is used to transform according to what happened
    in the fit method.

    :param X: features - Dataframe
    :param y: target vector - Series
    :param kwargs: free parameters - dictionary
    :return: X: the transformed data - Dataframe
    """
    X = X['text']
    dataset_hash = hash(str(X) + str(self.embedder.__dict__))
    if dataset_hash in self.dataset_cache:
        return self.dataset_cache[dataset_hash]
    else:
        embeddings = []
        for first in trange(0, len(X), self.batch_size):
            subset = X[first:first + self.batch_size]
            sentences = []
            for element in subset:
                sentence = Sentence(element)
                # sentence.tokens = sentence.tokens[:200]
                sentences.append(sentence)
            self.embedder.embed(sentences)
            for sentence in sentences:
                key = sentence.to_original_text()
                if key in self.vector_cache.keys():
                    vector = self.vector_cache[key]
                else:
                    vector = sentence.get_embedding().cpu().detach().numpy()
                    self.vector_cache[key] = vector
                embeddings.append(vector)
        embedding_dataset = numpy.vstack(embeddings)
        self.dataset_cache[dataset_hash] = embedding_dataset
        return embedding_dataset
def embedd_document_p(self, document: str, doc_id: str) -> Tuple[Tensor, str]:
    flair_doc = Sentence(document)
    self.document_embedding.embed(flair_doc)
    return flair_doc.get_embedding().detach().numpy(), doc_id
def _embed_document(self, document_text: str, doc_embeddings: DocumentPoolEmbeddings):
    sentence = Sentence(document_text)
    doc_embeddings.embed(sentence)
    return sentence.get_embedding().data.cpu().numpy()
# + hidden=true
lyrics_embeddings = all_song_lyrics.loc[:, ['key', 'lyrics_clean']].set_index('key')

all_embeddings = {}
count = 0
for i in lyrics_embeddings.index.tolist():
    count += 1
    if count % 100 == 0:
        print(count)
    text = lyrics_embeddings.loc[i, 'lyrics_clean']
    try:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        numpy_array = sentence.get_embedding().detach().numpy()
        all_embeddings[i] = numpy_array
    except Exception:
        # falls back to the previous iteration's embedding (wrapped in a list) if embedding fails
        all_embeddings[i] = [numpy_array]

flair_doc_embeddings = pd.DataFrame.from_dict(all_embeddings, orient='index')
flair_doc_embeddings.to_csv('flair_embeddings.csv')
# -

# ### Final dataset
os.chdir(data_folder)
flair_sentiment_df = pd.read_csv('./flair_sentiment.csv').set_index('key')
flair_sentiment_df.columns = ['flair_sentiment']
song_lyrics = all_song_lyrics.set_index('spotify_id').copy()
def training_pipeline_bert(filepath=None, num_words_to_print=10, prefix=None,
                           min_topics=19, max_topics=19, step=2):
    logging.info(f'Started training_pipeline : {min_topics}-{max_topics}')
    start = datetime.datetime.now()

    if filepath is not None:
        filepath = data_dir_local / filepath
    else:
        logging.error("Please enter file name")
        exit()
    if max_topics is None:
        logging.error("Please enter a valid topic number to train model")
        exit()

    col = cols[0]
    df = phraser.load_phrased_data_pipeline(to_load='text', verbose=True,
                                            overwrite_interim=True, prefix=None,
                                            training=True, col='resp_whytfa')

    # default to min/max topics
    num_topics_range = range(min_topics, max_topics + 1, step)
    print('Num_topics_range={}'.format(num_topics_range))

    # Contextual string embeddings are powerful embeddings that capture latent
    # syntactic-semantic information that goes beyond standard word embeddings.
    # Key differences: (1) they are trained without any explicit notion of words
    # and thus fundamentally model words as sequences of characters, and (2) they
    # are contextualised by their surrounding text, meaning that the same word
    # will have different embeddings depending on its contextual use.

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')
    bert_embedding = BertEmbeddings('bert-base-uncased')

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(df.index), 7168))  # .cuda()

    # fill tensor with embeddings
    i = 0
    for text in df['resp_whytfa']:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()
        i += 1
        # NOTE: only the first ~100 rows are embedded here
        if i > 100:
            break

    print("before the PCA")
    # detach the tensor and convert it to a NumPy array
    Y = X.cpu().detach().numpy()

    # We want to cluster these vectors into topics, and we'll invoke Agglomerative
    # Clustering with Ward affinity from scikit-learn to do so. Bottom-up
    # hierarchical clustering algorithms have a memory complexity of O(n^2),
    # so we'll use Principal Component Analysis to speed up this process.
    # As a side note, I did test a number of clustering algorithms (K-means, BIRCH,
    # DBSCAN, Agglomerative with complete/average affinity), but Ward seems to
    # perform the best in most cases.

    # reduce the dimensionality of our vectors to length 768
    pca = IncrementalPCA(copy=False, n_components=768, batch_size=1000)
    X_red = pca.fit_transform(Y)
    del X
    print("After the fit_transform")

    N_CLUSTERS = 5

    # Ward clustering
    ward = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                   affinity='euclidean',
                                   linkage='ward')
    pred_ward = ward.fit_predict(X_red)
    print("After fit_predict")

    df['topic'] = pred_ward
    df.to_csv('bert_withtopic.csv')
    print("Wrote bert_withtopic.csv")

    # get topic composition: group text into topic-documents
    topic_docs = []
    for topic in range(N_CLUSTERS):
        topic_docs.append(' '.join(df[df['topic'] == topic]['text_cl'].values))

    # apply function
    df_tfidf = get_top_words(topic_docs, 10)
    print(f"Top words: {df_tfidf}")

    # How good are our topics?
    # We find the centroids of the vectors by averaging them across each topic:
    topic_centroids = []
    for topic in tqdm(range(N_CLUSTERS)):
        X_topic = X_red[df.index[df['topic'] == topic]]
        X_mean = np.mean(X_topic, axis=0)
        topic_centroids.append(X_mean)

    # calculate the euclidean distance of each Tweet vector to its topic centroid
    topic_distances = []
    for row in tqdm(df.index):
        topic_centroid = topic_centroids[df.iloc[row]['topic']]
        X_row = X_red[row]
        topic_distance = euclidean(topic_centroid, X_row)
        topic_distances.append(topic_distance)

    df['topic_distance'] = topic_distances

    # Visualise the distribution of distances to the topic centroid:
    # the closer the distribution is to the left of the graph, the more compact the topic.
    df.to_csv('bert_withtopic_distance.csv')
    print('Wrote bert_withtopic_distance.csv')

    # Topic similarity - how similar the topics are to each other.
    # We construct a euclidean distance matrix between the topic centroids to find
    # the distance between the topic averages.
    df_dist_matrix = pd.DataFrame(distance_matrix(topic_centroids, topic_centroids),
                                  index=range(N_CLUSTERS),
                                  columns=range(N_CLUSTERS))
    print(f"df_dist_matrix={df_dist_matrix}")

    with open('df_dist_matrix', 'w') as fout:
        fout.write(u'#' + '\t'.join(str(e) for e in df_dist_matrix.shape) + '\n')
        df_dist_matrix.values.tofile(fout)
def embedd_document(self, document: str) -> Tensor:
    flair_doc = Sentence(document)
    self.document_embedding.embed(flair_doc)
    return flair_doc.get_embedding().detach().numpy()
explain_pred(str(to_process))
label = input(
    "What is the ground truth label of this? Separate labels with a space")
if label == "":
    pass
elif label == "f":
    break
elif label == "stop":
    csvfile.close()
    if keras and increment:
        pipe.named_steps['model'].model.save('keras_model.h5')
        pipe.named_steps['model'].model = None
    joblib.dump(pipe, 'saved_card_classification.pkl')
    print("Model Dumped!!!!")
    done = True
    sys.exit()
else:
    the_labels = label.split()
    if increment:
        t_model = pipe.named_steps['model']
        ppset = Sentence(str(to_process))
        stacked_embeddings.embed(ppset)
        the_emb = ppset.get_embedding().cpu().detach().numpy().reshape(1, -1)
        # incremental learning mode engaged
        t_model.partial_fit(the_emb, the_labels)
    the_labels.append(str(to_process))
    spamwriter.writerow(the_labels)
    csvfile.flush()
def compute_elmo_embedding(keyword):
    sentence = Sentence(keyword)
    document_embedding.embed(sentence)
    return sentence.get_embedding().detach().cpu().numpy()
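# Hypothetical setup sketch for the module-level document_embedding used above
# (an assumption, not from the original code): pool ELMo word embeddings over
# the keyword. Requires the allennlp extra that flair's ELMoEmbeddings depends on.
from flair.embeddings import ELMoEmbeddings, DocumentPoolEmbeddings

document_embedding = DocumentPoolEmbeddings([ELMoEmbeddings('original')])
vec = compute_elmo_embedding("renewable energy")
print(vec.shape)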