import torch
from tqdm import tqdm
from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings, FlairEmbeddings


def generate_topics_on_series(series):
    """Embed every text in a pandas Series with pooled BERT + Flair embeddings.

    Based on: https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        np.ndarray: one 7168-dimensional embedding per row of `series`.
    """
    validate_text(series)
    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")
    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])
    # set up an empty tensor on the GPU
    X = torch.empty(size=(len(series.index), 7168)).cuda()
    # fill the tensor with embeddings
    for i, text in enumerate(tqdm(series)):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()
    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()
    return X
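# Usage sketch (an assumption, not part of the original source): embed a tiny
# pandas Series and inspect the result. Requires a CUDA device, since
# generate_topics_on_series allocates its buffer with .cuda(), plus the
# validate_text helper from the surrounding module.
import pandas as pd

sample = pd.Series(["markets fell sharply today", "a new vaccine trial began"])
vectors = generate_topics_on_series(sample)
print(vectors.shape)  # (2, 7168): 2 x 2048 (Flair LMs) + 3072 (BERT layers)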
class DefaultFeaturizerForMultiLabelRank(ObservationFeaturizer):
    def __init__(self, action_space: ActionSpace, embedding_type: str = "fasttext",
                 pre_process: bool = False, device: str = "cpu"):
        self.device = device
        self.pre_process = pre_process
        self.text_pre_processor = TextPreProcessor(language="english")
        self._setup_device()
        embeddings = EmbeddingRegistry.get_embedding(embedding_type)
        self.doc_embeddings = DocumentPoolEmbeddings(embeddings).to(
            torch.device(self.device))
        self.action_space = action_space
        self._current_input_embeddings = None

    def _setup_device(self):
        flair.device = torch.device(self.device)

    def init_on_reset(self, input_text: Union[List[str], str]):
        # pooled document embeddings
        text = self.text_pre_processor.process(
            input_text) if self.pre_process else input_text
        sent = Sentence(text)
        self.doc_embeddings.embed(sent)
        self._current_input_embeddings = torch.tensor(
            sent.embedding.cpu().detach().numpy())

    def featurize(self, observation: Observation) -> torch.Tensor:
        input_vector = self._current_input_embeddings
        context_vector = self._featurize_context(
            observation.get_current_action_history())
        concatenated = torch.cat((input_vector, context_vector), dim=0)
        return concatenated

    def get_observation_dim(self) -> int:
        return self._get_input_dim() + self._get_context_dim()

    def _featurize_input(self, input_index: int) -> torch.Tensor:
        # the input does not change on each step
        return self._current_input_embeddings

    def _featurize_context(self, context: List[str]) -> torch.Tensor:
        # bag-of-actions representation
        context_vector = torch.zeros(self.action_space.size())
        action_indices = [
            self.action_space.action_to_ix(action) for action in context
        ]
        context_vector[action_indices] = 1.0
        return context_vector

    def _get_input_dim(self):
        sent = Sentence("A random text to get the embedding dimension")
        self.doc_embeddings.embed(sent)
        dim = sent[0].embedding.shape[0]
        sent.clear_embeddings()
        return dim

    def _get_context_dim(self):
        return self.action_space.size()
def get_embeddings(self, text):
    # document_embeddings = DocumentPoolEmbeddings(
    #     [self.glove_embedding,  # initialize the document embeddings, mode = mean
    #      self.flair_embedding_backward,
    #      self.flair_embedding_forward])

    # GloVe + BPE
    document_embeddings = DocumentPoolEmbeddings(
        [self.glove_embedding, self.bpe_embedding])

    # NILC fastText 600 embedding
    # document_embeddings = DocumentPoolEmbeddings(
    #     [self.fast_text_embedding])

    # Flair
    # document_embeddings = DocumentPoolEmbeddings(
    #     [self.flair_embedding_forward])

    # ELMo
    # document_embeddings = DocumentPoolEmbeddings(
    #     [self.elmo_embedding])

    # create an example sentence
    sentence = Sentence(text)
    # embed the sentence with our document embedding
    document_embeddings.embed(sentence)
    # return the embedded sentence's pooled vector
    return sentence.get_embedding()
class FlairEncoder(BaseTextEncoder):
    is_trained = True

    def __init__(self,
                 word_embedding: str = 'glove',
                 flair_embeddings: Tuple[str, str] = ('news-forward', 'news-backward'),
                 pooling_strategy: str = 'mean',
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.word_embedding = word_embedding
        self.flair_embeddings = flair_embeddings
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, FlairEmbeddings
        self._flair = DocumentPoolEmbeddings(
            [WordEmbeddings(self.word_embedding),
             FlairEmbeddings(self.flair_embeddings[0]),
             FlairEmbeddings(self.flair_embeddings[1])],
            pooling=self.pooling_strategy)

    @batching
    @as_numpy_array
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        from flair.data import Sentence
        import torch
        # tokenize text
        batch_tokens = [Sentence(v) for v in text]
        self._flair.embed(batch_tokens)
        return torch.stack([v.embedding for v in batch_tokens]).detach()
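# A minimal standalone sketch of what FlairEncoder.post_init/encode set up
# (BaseTextEncoder, @batching and @as_numpy_array come from the surrounding
# codebase, so only the flair portion is reproduced here):
import torch
from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, WordEmbeddings

pool = DocumentPoolEmbeddings(
    [WordEmbeddings('glove'),
     FlairEmbeddings('news-forward'),
     FlairEmbeddings('news-backward')],
    pooling='mean')
batch = [Sentence(t) for t in ['hello world', 'flair pools word vectors']]
pool.embed(batch)  # flair embeds a whole list of Sentences in one call
vectors = torch.stack([s.embedding for s in batch]).detach()  # (2, 4196)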
class Embedding:
    def __init__(self, filepath: Union[Path, str]):
        self._bert = BertEmbeddings(filepath)
        self.document_embeddings = DocumentPoolEmbeddings([self._bert])

    def get_vector(self, text: str) -> np.ndarray:
        sentence = Sentence(text, use_tokenizer=False)
        self.document_embeddings.embed(sentence)
        with torch.no_grad():
            vector = sentence.get_embedding()
        return vector.numpy()

    def process_reference_sentences(self, sentences: Dict[str, List[str]]):
        for name, sentence in sentences.items():
            try:
                sentence = " ".join(sentence)
                yield name, self.get_vector(sentence)
            except RuntimeError:
                logging.error("Oops! Reference sentence too long...")

    def process_batch(self, documents: Documents):
        for name, sentence in self.build_sentences(documents):
            logging.info(f"Processing {name}...")
            try:
                yield name, self.get_vector(sentence)
            except RuntimeError:
                logging.error("Oops! Sentence too long...")

    def build_sentences(self, documents: Documents):
        for name, sentences in documents.items():
            for index, sentence in sentences.items():
                yield f"{name}-{index}", " ".join(sentence)
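# Usage sketch (assumption): any valid BertEmbeddings argument works here;
# "bert-base-uncased" stands in for a local model path. Note that get_vector
# calls .numpy() directly on the tensor, so this assumes CPU inference.
embedder = Embedding("bert-base-uncased")
vec = embedder.get_vector("a short example sentence")
print(vec.shape)  # (3072,) for bert-base: four concatenated layers, mean-pooled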
class EmbeddingSimilarityTransformer(CustomTransformer):
    _modules_needed_by_name = [
        'regex==2018.1.10', 'flair==0.4.1', 'segtok==1.5.7'
    ]
    _is_reproducible = False
    _can_use_gpu = True
    _repl_val = 0

    def __init__(self, embedding_name, **kwargs):
        super().__init__(**kwargs)
        self.embedding_name = embedding_name

    @staticmethod
    def get_default_properties():
        return dict(col_type="text", min_cols=2, max_cols=2, relative_importance=1)

    @staticmethod
    def get_parameter_choices():
        return {"embedding_name": ["glove", "en", "bert"]}

    @property
    def display_name(self):
        name_map = {"glove": "Glove", "en": "FastText", "bert": "BERT"}
        return "%sEmbedding_CosineSimilarity" % name_map[self.embedding_name]

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        return self.transform(X)

    def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                # fall back to a sentinel score if embedding fails
                output.append(-99)
        return np.array(output)
def test_document_pool_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    for mode in ['mean', 'max', 'min']:
        embeddings = DocumentPoolEmbeddings([glove, charlm], mode=mode)
        embeddings.embed(sentence)
        assert len(sentence.get_embedding()) == 1124
        sentence.clear_embeddings()
        assert len(sentence.get_embedding()) == 0
def test_document_pool_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    for mode in ['mean', 'max', 'min']:
        embeddings = DocumentPoolEmbeddings(
            [glove, charlm], pooling=mode, fine_tune_mode='none')
        embeddings.embed(sentence)
        assert len(sentence.get_embedding()) == 1074
        sentence.clear_embeddings()
        assert len(sentence.get_embedding()) == 0
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, \
        DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')
    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])
    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()
    embeddings = []
    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)
        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)
        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)
    columns = [
        "embedding_{}".format(feature) for feature in range(embeddings.shape[1])
    ]
    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)
    toc = time.time()
    print("[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {:.1f}s"
          .format(embeddings_type, typs, toc - tic))
def get_sentence_embeddings(texts):
    word_embeddings = [BertEmbeddings('bert-large-uncased')]
    document_embeddings = DocumentPoolEmbeddings(word_embeddings)
    sentences = [Sentence(text) for text in texts]
    embeddings = []
    for sentence in sentences:
        document_embeddings.embed(sentence)
        sentence_embedding = sentence.get_embedding().numpy().reshape(-1)
        embeddings.append(sentence_embedding)
    return np.array(embeddings)
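# Usage sketch (assumption): numpy and the flair classes are imported at
# module level, and inference runs on CPU so .numpy() works without .cpu().
vectors = get_sentence_embeddings(["first text", "second text"])
print(vectors.shape)  # (2, 4096) for bert-large: four concatenated layers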
def vectorize(string: str = None, selected_base_models: list = None):
    # 'vectorizes' the input string using one or a combination of word embeddings - if
    # 'vector representation' is being selected at Algorithms construction time.
    """
    :param string: input string
    :param selected_base_models: list of the models we want to use in order to create word embeddings
    :return: embedding
    """
    if not selected_base_models:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> Provide at least one base model: ['bert',"
                         f"'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")
    embeddings = []
    if 'bert' in selected_base_models:
        embeddings.append(bert_embedding)
    if 'roberta' in selected_base_models:
        embeddings.append(roberta_embedding)
    if 'glove' in selected_base_models:
        embeddings.append(glove_embedding)
    if 'character' in selected_base_models:
        embeddings.append(character_embeddings)
    if 'flair_forward' in selected_base_models:
        embeddings.append(flair_forward)
    if 'flair_backward' in selected_base_models:
        embeddings.append(flair_backward)
    # if none of the above, then the model combination passed is not supported
    if not embeddings:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> {selected_base_models} not available. "
                         f"Supported models: "
                         f"['bert', 'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")
    # We use Flair's API here: DocumentPoolEmbeddings takes a list of embeddings to be
    # combined. For a string containing whitespace, it creates word embeddings for each
    # word and then combines them; the 'pooling' parameter controls how they are
    # combined - we have chosen to combine them by taking their mean value.
    stacked_embeddings = DocumentPoolEmbeddings(embeddings=embeddings,
                                                fine_tune_mode='none',
                                                pooling='mean')
    sentence = Sentence(string)
    # combination by concatenating each model's output (stacking) - thanks, Flair API :)
    stacked_embeddings.embed(sentence)
    # detach() because if fine_tune_mode is set to, for example, 'linear' or 'nonlinear',
    # the embedding also carries the computed gradients of each layer (PyTorch's autograd
    # records all the operations that we are performing and replays them backward to
    # compute gradients). We could tweak fine_tune_mode if we wanted to fine-tune the
    # base model further by training on external data sets.
    return sentence.embedding.detach().numpy()
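# Usage sketch (assumption): the module-level embedding objects referenced in
# vectorize (glove_embedding, flair_forward, ...) are initialised elsewhere,
# e.g. glove_embedding = WordEmbeddings('glove'). With mean pooling, the
# output length is the sum of the selected models' dimensions.
emb = vectorize("a short input string", selected_base_models=['glove', 'flair_forward'])
print(emb.shape)  # e.g. (2148,): 100 (GloVe) + 2048, if flair_forward is news-forward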
def other_embeddings(embd):
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        # round-trip through a TF constant (emb is already a numpy array,
        # so this keeps the original behaviour but is effectively a no-op)
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    return train_data_list, test_data_list
def is_difference_large(text1, text2):
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)
    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)
    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    print(np.mean(similarity))
class FlairGlove100Embed(BaseEmbed):
    def __init__(self, n_dims=100, make_unit_length=True):
        from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
        super().__init__(n_dims=n_dims, make_unit_length=make_unit_length)
        embeddings = [WordEmbeddings('glove')]
        self.embeddings = DocumentPoolEmbeddings(embeddings, fine_tune_mode='none')
        self.log = getLogger(type(self).__name__)

    def get_sentence_vector(self, text):
        from flair.data import Sentence
        sentence = Sentence(clean_text(text))
        _ = self.embeddings.embed(sentence)
        a = sentence.get_embedding()
        result = a.detach().cpu().numpy()
        if np.sum(result[0:5]) == 0:
            # an all-zero prefix suggests an empty/OOV sentence; fall back to noise
            result = np.random.randn(self.n_dims)
        return result

    def fit(self, feature: Feature, **kwargs):
        super().fit(feature, **kwargs)
        self.log.debug("End Fitting FlairEmbedding")
        return

    def transform(self, feature: Feature, **kwargs) -> np.ndarray:
        self.log.debug("Start Transform TextEmbedding...")
        outputs = np.vstack([
            np.array([self.get_sentence_vector(i) for i in t]).mean(0)
            if is_1d_array(t) else self.get_sentence_vector(t)
            for t in tqdm(feature)
        ])
        outputs = unit_length(outputs, axis=1) if self.make_unit_length else outputs
        self.log.debug("End Transform TextEmbedding")
        return self.check_output_dims(outputs, feature)
def createFlairEmbeddings(embedding_list, data):
    embeddings = []
    sentences = data['Interest_Name'].values
    model = DocumentPoolEmbeddings(embedding_list, fine_tune_mode='nonlinear')
    for sent in sentences:
        sentence = Sentence(sent)
        model.embed(sentence)
        modeled_embedding = sentence.get_embedding()
        array = modeled_embedding.cpu().detach().numpy()
        embeddings.append(array)
    return embeddings
class FlairGlove100AndBytePairEmbedding(ContentEmbeddingBase):
    def __init__(self, make_unit_length=True):
        super().__init__(n_dims=200, make_unit_length=make_unit_length)
        embeddings = [WordEmbeddings('glove'), BytePairEmbeddings('en')]
        self.embeddings = DocumentPoolEmbeddings(embeddings)
        self.log = getLogger(type(self).__name__)

    # noinspection PyUnresolvedReferences
    def get_sentence_vector(self, text):
        sentence = Sentence(clean_text(text))
        _ = self.embeddings.embed(sentence)
        a = sentence.get_embedding()
        result = a.cpu().detach().numpy()
        if np.sum(result[0:5]) == 0:
            result = np.random.randn(self.n_dims)
        return result

    def fit(self, feature: Feature, **kwargs):
        super().fit(feature, **kwargs)
        assert feature.feature_type == FeatureType.STR
        self.log.debug("End Fitting FlairGlove100AndBytePairEmbedding for feature name %s",
                       feature.feature_name)
        return

    def transform(self, feature: Feature, **kwargs) -> np.ndarray:
        self.log.debug("Start Transform FlairGlove100AndBytePairEmbedding for feature name %s",
                       feature.feature_name)
        assert feature.feature_type == FeatureType.STR
        outputs = np.vstack([self.get_sentence_vector(t) for t in tqdm(feature.values)])
        outputs = unit_length(outputs, axis=1) if self.make_unit_length else outputs
        self.log.debug("End Transform FlairGlove100AndBytePairEmbedding for feature name %s",
                       feature.feature_name)
        return self.check_output_dims(outputs, feature)
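# Standalone sketch of the pooled GloVe + byte-pair vector this class builds
# (clean_text and the Feature/ContentEmbeddingBase types live elsewhere in
# the codebase, so only the flair part is reproduced here):
from flair.data import Sentence
from flair.embeddings import BytePairEmbeddings, DocumentPoolEmbeddings, WordEmbeddings

pool = DocumentPoolEmbeddings([WordEmbeddings('glove'), BytePairEmbeddings('en')])
s = Sentence("byte pair pieces help with rare words")
pool.embed(s)
print(s.get_embedding().shape)  # 100 (GloVe) + 100 (byte-pair) = 200 dims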
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()
    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings([tr_embedding, char_embedding])
    tweetTensors = []
    for tweet in tweetList:
        # print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
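# Usage sketch (assumption): norm_tweet is this project's tweet normaliser;
# a lowercasing stand-in is substituted here only to make the call runnable.
norm_tweet = lambda t: t.lower()
tensors = embed_tweet(["Merhaba dünya!", "Bugün hava çok güzel."])
print(tensors[0].shape)  # Turkish fastText (300) + character embedding dims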
def flair_embeddings(x, *args):
    from flair.data import Sentence
    from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings
    word_embedders, aggregating_strategy, aggregating_params = args[0], args[1], args[2]
    embedding = None
    if aggregating_strategy == 'pooling':
        # TODO: check if kwargs work
        embedding = DocumentPoolEmbeddings(word_embedders, **aggregating_params)
    if aggregating_strategy == 'rnn':
        # TODO: check if kwargs work
        embedding = DocumentRNNEmbeddings(word_embedders, **aggregating_params)
    if embedding is None:
        raise KeyError(f"Unknown aggregating strategy: {aggregating_strategy}")
    sentence = Sentence(x)
    embedding.embed(sentence)
    return sentence.embedding.detach().numpy().reshape(-1, 1)
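# Usage sketch (assumption): pool GloVe vectors with mean pooling; the
# positional args mirror (word_embedders, aggregating_strategy, aggregating_params).
from flair.embeddings import WordEmbeddings

vec = flair_embeddings("a short text",
                       [WordEmbeddings('glove')], 'pooling', {'pooling': 'mean'})
print(vec.shape)  # (100, 1) column vector for GloVe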
def compute_embedding(embedding, remove_punctuation: bool, file_name: str):
    """
    Computes the embedding with the given model for all arguments.

    :param embedding: embedding model to pool over each text
    :param remove_punctuation: whether punctuation should be removed first
    :param file_name: file the embedded arguments are saved to
    """
    arguments = Arguments()
    document_embedding = DocumentPoolEmbeddings([embedding])
    embedded_arguments = {}
    for argument in arguments.ground_truth_arguments:
        premises = argument['premises']
        conclusion = argument['conclusion']
        conclusion_text = conclusion['conclusion_text']
        if remove_punctuation:
            conclusion_text = remove_punctuations(conclusion_text)
        conclusion_sentence = Sentence(conclusion_text)
        document_embedding.embed(conclusion_sentence)
        embedded_conclusion = conclusion_sentence.get_embedding().detach().numpy().tolist()
        embedded_premises = {}
        argument_uid = None
        for premise in premises:
            premise_text = premise[1]
            if remove_punctuation:
                premise_text = remove_punctuations(premise_text)
            premise_sentence = Sentence(premise_text)
            document_embedding.embed(premise_sentence)
            embedded_premise = premise_sentence.get_embedding().detach().numpy().tolist()
            embedded_premises[premise[2]] = embedded_premise
            argument_uid = premise[0]
        embedded_arguments[argument_uid] = [embedded_conclusion, embedded_premises]
    save_embedding(embedded_arguments, file_name)
def extract_bert_features(json_file, dataroot_folder, choice="yes_no", split="train"):
    questions = json.load(open(json_file))['questions']
    question_ids = [quest['question_id'] for quest in questions]
    # questions = questions[0:10]
    bert = BertEmbeddings('bert-base-uncased')
    doc_bert = DocumentPoolEmbeddings([bert])
    bert_embed_matrix = np.zeros((len(questions), 3072))
    print('Extracting bert features')
    for index, quest in tqdm(enumerate(questions)):
        sentence = Sentence(quest['question'])
        doc_bert.embed(sentence)
        bert_embed_matrix[index] = sentence.embedding.numpy()
    hdf5_file_path = os.path.join(dataroot_folder, split + '_bert_' + choice + '.hdf5')
    h5f = h5py.File(hdf5_file_path, 'w')
    h5f.create_dataset('bert_embeddings', data=bert_embed_matrix)
    h5f.create_dataset('question_ids', data=question_ids)
    h5f.close()
def flair_embed_docPool(sentence: str) -> Vector:
    """Embed text with Flair's FlairEmbeddings, pooled by DocumentPoolEmbeddings (for multi-word input)."""
    from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings
    from flair.data import Sentence

    # init the word-level embeddings
    flair_embedding_forward = FlairEmbeddings('news-forward')
    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])
    # create an example sentence
    sentence = Sentence(sentence)
    # embed the sentence with our document embedding
    document_embeddings.embed(sentence)
    # return the embedded sentence's pooled vector
    return sentence.get_embedding()
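# Usage sketch (assumption): Vector above is taken to be a project-level type
# alias for a torch.Tensor, so the result supports .shape directly.
emb = flair_embed_docPool("document pooling averages word states")
print(emb.shape)  # 2048 dims for the news-forward language model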
def is_difference_large(text1: str, text2: str) -> bool:
    text1_preprocessed = Sentence(text1)
    text2_preprocessed = Sentence(text2)
    glove_embedding = WordEmbeddings('glove')
    document_embedding = DocumentPoolEmbeddings([glove_embedding])
    document_embedding.embed(text1_preprocessed)
    document_embedding.embed(text2_preprocessed)
    text1_embedding = text1_preprocessed.get_embedding()
    text2_embedding = text2_preprocessed.get_embedding()
    text1_embedding = np.reshape(text1_embedding, (-1, 1))
    text2_embedding = np.reshape(text2_embedding, (-1, 1))
    similarity = cosine_similarity(text1_embedding, text2_embedding)
    # Example threshold
    # TODO: Determine a good threshold, for example in pitch
    return np.mean(similarity) <= 0.06
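# Usage sketch (assumption): requires numpy, sklearn's cosine_similarity and
# the flair imports at module level; the 0.06 threshold is the author's
# placeholder, not a tuned value.
print(is_difference_large("the cat sat on the mat", "stock prices fell today"))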
def cosine_embedding(sentence1, sentence2, model):
    embeddings = DocumentPoolEmbeddings(model, mode='mean')
    s1 = Sentence(sentence1)
    s2 = Sentence(sentence2)
    embeddings.embed(s1)
    embeddings.embed(s2)
    v1 = s1.get_embedding()
    v2 = s2.get_embedding()
    # print(v1, v2)  # check that you don't get empty tensors
    cos_sim = dot(v1, v2) / (norm(v1) * norm(v2))
    # print(cos_sim)
    return cos_sim  # lies in [-1, 1]
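# Usage sketch (assumption): `model` is a list of flair embeddings, and dot /
# norm are numpy.dot / numpy.linalg.norm at module level. Newer flair releases
# use the keyword pooling='mean' instead of mode='mean'.
from flair.embeddings import WordEmbeddings

sim = cosine_embedding("good morning", "hello there", [WordEmbeddings('glove')])
print(float(sim))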
class InformedFeaturizer(ObservationFeaturizer):
    def __init__(self, device: str = "cpu"):
        self.device = device
        self._setup_device()
        self.doc_embeddings = DocumentPoolEmbeddings([WordEmbeddings("en")])

    def init_on_reset(self, question: str, facts: List[str]):
        pass

    def featurize(self, observation: Observation) -> torch.Tensor:
        sim_scores = [
            self._get_sim(observation.get_question(), observation.get_choice()),
            self._get_sim(".".join(observation.get_facts()), observation.get_choice())
        ]
        sim_scores = torch.tensor(sim_scores)
        return sim_scores

    def get_observation_dim(self):
        return 2

    def _get_sentence_embedding(self, text: str) -> torch.Tensor:
        text = "..." if len(text) == 0 else text
        sent = Sentence(text)
        self.doc_embeddings.embed(sent)
        if len(sent) > 1:
            embedding = torch.tensor(sent.embedding.cpu().numpy()).reshape(1, -1)
        else:
            embedding = torch.tensor(sent[0].embedding.cpu().numpy()).reshape(1, -1)
        return embedding

    def _setup_device(self):
        import flair
        import torch
        flair.device = torch.device(self.device)

    def _get_sim(self, query: str, choice_text: str):
        sim = torch.nn.CosineSimilarity(dim=1)(
            self._get_sentence_embedding(query),
            self._get_sentence_embedding(choice_text))
        return sim
# (snippet starts mid-assignment; the embedding list passed to
# DocumentPoolEmbeddings is elided in the source)
flair_emb = DocumentPoolEmbeddings(
    ...,
    pooling='mean',
)
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
poss_sections = {
    '#gender': ['gender', 'sex', 'gentleman'],
    '#male': ['male', 'man', 'men'],
    '#female': ['female', 'woman', 'women'],
}
# substitute each term by its embedding
for list_candidates in tqdm(poss_sections.values(), desc='Embedding search terms'):
    for i in range(len(list_candidates)):
        sentence = Sentence(list_candidates[i].lower())
        flair_emb.embed(sentence)
        list_candidates[i] = sentence.embedding
        sentence.clear_embeddings()

dataset = []
print('Retrieving documents from database...')
documents = Database.list_raw_documents()


def create_emb(doc):
    batch_size = 60
    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
def _embed_document(self, document_text: str,
                    doc_embeddings: DocumentPoolEmbeddings):
    sentence = Sentence(document_text)
    doc_embeddings.embed(sentence)
    return sentence.get_embedding().data.cpu().numpy()
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, \
    OneHotEmbeddings, DocumentRNNEmbeddings
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
# embeddings = OneHotEmbeddings(corpus)

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding],
    # flair_embedding_backward, flair_embedding_forward],
    # pooling='min',
    fine_tune_mode='nonlinear')
# note: this rebinding means the pooled embeddings above are not the ones used below
document_embeddings = DocumentRNNEmbeddings([glove_embedding])
document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding], rnn_type='LSTM')

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
class EasyDocumentEmbeddings:
    """
    Document Embeddings generated by pool and rnn methods applied to the word embeddings of text

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyDocumentEmbeddings("bert-base-cased", "xlnet-base-cased", methods=["rnn"])
    ```

    **Parameters:**

    * **embeddings** - Non-keyword variable number of strings referring to model names or paths
    * **methods** - A list of strings to specify which document embeddings to use, i.e. ["rnn", "pool"]
      (avoids unnecessary loading of models if only using one)
    * **configs** - A dictionary of configurations for flair's rnn and pool document embeddings

    ```python
    >>> example_configs = {"pool_configs": {"fine_tune_mode": "linear", "pooling": "mean"},
    ...                    "rnn_configs": {"hidden_size": 512,
    ...                                    "rnn_layers": 1,
    ...                                    "reproject_words": True,
    ...                                    "reproject_words_dimension": 256,
    ...                                    "bidirectional": False,
    ...                                    "dropout": 0.5,
    ...                                    "word_dropout": 0.0,
    ...                                    "locked_dropout": 0.0,
    ...                                    "rnn_type": "GRU",
    ...                                    "fine_tune": True}}
    ```
    """

    __allowed_methods = ["rnn", "pool"]
    __allowed_configs = ("pool_configs", "rnn_configs")

    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = ["rnn", "pool"],
        configs: Dict = {
            "pool_configs": {"fine_tune_mode": "linear", "pooling": "mean"},
            "rnn_configs": {
                "hidden_size": 512,
                "rnn_layers": 1,
                "reproject_words": True,
                "reproject_words_dimension": 256,
                "bidirectional": False,
                "dropout": 0.5,
                "word_dropout": 0.0,
                "locked_dropout": 0.0,
                "rnn_type": "GRU",
                "fine_tune": True,
            },
        },
    ):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []
        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods
        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)
        # Load the correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, "
                        f"check documentation or custom model path to verify specified model"
                    )
        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")

    def embed_pool(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """
        Pooled document embeddings

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats

        **return** - A list of Flair's `Sentence`s
        """
        if isinstance(text, str):
            sentences = [Sentence(text)]
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        elif isinstance(text, Sentence):
            sentences = [text]
        else:
            sentences = text
        self.pool_embeddings.embed(sentences)
        return sentences

    def embed_rnn(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """
        RNN document embeddings

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats

        **return** - A list of Flair's `Sentence`s
        """
        if isinstance(text, str):
            sentences = [Sentence(text)]
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        elif isinstance(text, Sentence):
            sentences = [text]
        else:
            sentences = text
        self.rnn_embeddings.embed(sentences)
        return sentences
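# Usage sketch (assumption, mirroring the class docstring): load only the
# pooled method to skip the RNN, then embed a batch of raw strings.
doc_embeddings = EasyDocumentEmbeddings("bert-base-cased", methods=["pool"])
sentences = doc_embeddings.embed_pool(["first document", "second document"])
print(sentences[0].get_embedding().shape)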
def main():
    print("Instantiate embeddings")
    embeddings = DocumentPoolEmbeddings([
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward"),
        ELMoEmbeddings("pubmed"),
        BertEmbeddings("bert-large-uncased"),
    ])

    print("Load pubmed_data")
    pubmed_data = pd.concat([
        pd.read_json(f"data/medline/{medline_file}")
        for medline_file in ["medline_2016.json", "medline_2017.json", "medline_2018.json"]
    ])

    print("pubmed_corpus")
    pubmed_corpus = [
        try_sentence(text) for text in pubmed_data.title.apply(preproc).head(10_000)
    ]
    pubmed_corpus = [text for text in pubmed_corpus if text]
    print(pubmed_corpus[0:5])

    print("query")
    query = [
        Sentence(text) for text in [
            "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization.",
            "Prioritizing putative influential genes in cardiovascular disease susceptibility by applying tissue-specific Mendelian randomization.",
            "Longitudinal analysis strategies for modelling epigenetic trajectories",
            "FATHMM-XF: accurate prediction of pathogenic point mutations via extended features",
            "PhenoSpD: an integrated toolkit for phenotypic correlation estimation and multiple testing correction using GWAS summary statistics.",
            "LD Hub: a centralized database and web interface to perform LD score regression that maximizes the potential of summary level GWAS data for SNP heritability and genetic correlation analysis.",
            "MELODI: Mining Enriched Literature Objects to Derive Intermediates",
            "The MR-Base platform supports systematic causal inference across the human phenome",
        ]
    ]

    print("Embed")
    for text in query + pubmed_corpus:
        embeddings.embed(text)

    print("Calculate scores")
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-5)
    cos_scores = []
    for query_id, query_text in enumerate(query):
        cos_res = [
            {
                "query_id": query_id,
                "target_id": target_id,
                "score": cos(query_text.embedding, target_text.embedding).item(),
            }
            for target_id, target_text in enumerate(pubmed_corpus)
        ]
        cos_scores.append(cos_res)
    cos_scores = pd.concat(pd.DataFrame(x) for x in cos_scores)
    print(cos_scores)

    n = 5
    for query_id, query_text in enumerate(query):
        print(f"# Query {query_id}")
        print(query_text)
        top_n = (
            cos_scores
            .query(f"query_id == {query_id}")
            .sort_values("score", ascending=False)
            .head(n)
        )
        for target_id, target_score in zip(top_n.target_id, top_n.score):
            print(f"  ## Candidate {target_id}, score {target_score}")
            print(f"  {pubmed_corpus[target_id]}\n")
        print("\n\n")
class EmbeddingSimilarityTransformer(CustomTransformer):
    _modules_needed_by_name = [
        "gensim==3.8.0", 'regex==2019.12.17', 'flair==0.4.1', 'segtok==1.5.7'
    ]
    _is_reproducible = False
    _can_use_gpu = True
    _repl_val = 0
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail

    def __init__(self, embedding_name, **kwargs):
        super().__init__(**kwargs)
        self.embedding_name = embedding_name

    @staticmethod
    def is_enabled():
        return False  # sometimes package flair has issues installing

    @staticmethod
    def can_use(accuracy, interpretability, **kwargs):
        """Uses all GPU memory - can lead to OOM failures in combination with other GPU-based transformers."""
        return False

    @staticmethod
    def get_default_properties():
        return dict(col_type="text", min_cols=2, max_cols=2, relative_importance=1)

    @staticmethod
    def get_parameter_choices():
        return {"embedding_name": ["glove", "en", "bert"]}

    @property
    def display_name(self):
        name_map = {"glove": "Glove", "en": "FastText", "bert": "BERT"}
        return "%sEmbedding_CosineSimilarity" % name_map[self.embedding_name]

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        return self.transform(X)

    def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                # fall back to a sentinel score if embedding fails
                output.append(-99)
        return np.array(output)