import syntok.segmenter as segmenter document = Document() ## Create a python-docx document cos = CosineSimilarity(dim=1, eps=1e-6) sent_level = False dynamic = True graph = False doc_embeddings = [] scores = [] stacked_embeddings = DocumentPoolEmbeddings([ WordEmbeddings('en'), #WordEmbeddings('glove'), #WordEmbeddings('extvec'),#ELMoEmbeddings('original'), #BertEmbeddings('bert-base-cased'), #FlairEmbeddings('news-forward-fast'), #FlairEmbeddings('news-backward-fast'), #OpenAIGPTEmbeddings() #TransformerXLEmbeddings() ]) #, mode='max') def set_card(): print("Input the Card Text, press Ctrl-D to end text entry") card = sys.stdin.read() #input("Input the Card Text: ") card_tag = input( "Input the card_tag, or a -1 to summarize in-terms of the card itself: " ) card = str(card) if str( card_tag
"n_estimators": [50, 100, 200], "max_depth": [15, 25, 35] }), "Logistic Regression": (LogisticRegression(solver="saga", multi_class="multinomial"), { "penalty": ["l2", "l1"] }), "Naive Bayes": (MultinomialNB(), { "alpha": [0.15, 0.25, 0.5, 0.65], }), "SVM": (SGDClassifier(loss='hinge', alpha=0.001, random_state=42), { "alpha": [0.0005, 0.001, 0.005, 0.01], }), } _EMBEDDER = DocumentPoolEmbeddings([WordEmbeddings("glove")], "mean") _INPUT_OPTIONS = { "TFIDF": (False, False), "GloveExtraFeatures": (True, True), "Glove": (True, False), } def read_data(set_): return np.load(_DATA_PATH / f"data_{set_}", allow_pickle=True) def preprocess(X, lem=True, stem=True,
import csv  # FIX: csv.reader is used below but csv was never imported here
import pickle
import torch
from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings

# Report which GPU torch will use (get_device_name raises if CUDA is absent).
print(torch.cuda.get_device_name(torch.cuda.current_device()))
print(torch.cuda.is_available())

# Input CSV of labeled news: ',' delimiter, '|' quote character.
src = open("./labeled_news.reversed.csv", 'r', newline='', encoding="utf-8")
src_reader = csv.reader(src, delimiter=",", quotechar="|")
# Destination pickle for the computed BERT document vectors.
dst = open("./4bert_vectors.pickle", "wb")

# init embedding: multilingual BERT word embeddings, pooled per document.
embedding = BertEmbeddings('bert-base-multilingual-cased')
document_embeddings = DocumentPoolEmbeddings([embedding])


def getBertVector(text):
    """Return the pooled BERT embedding of *text* as a numpy array.

    FIX: the parameter was named ``str``, shadowing the builtin; renamed to
    ``text`` (positional callers are unaffected).

    :param text: the raw text to embed
    :return: 1-D numpy array — the pooled document embedding
    """
    sentence = Sentence(text)
    # embed the words, then pool them into a single document vector
    document_embeddings.embed(sentence)
    return sentence.get_embedding().detach().numpy()
class FlairTextEncoder(BaseTorchEncoder):
    """
    Encode an array of string in size `B` into an ndarray in size `B x D`.

    The ndarray potentially is BatchSize x (Channel x Height x Width)

    Internally, :class:`FlairTextEncoder` wraps the DocumentPoolEmbeddings from Flair.

    :param embeddings: the name of the embeddings. Supported models include

        - ``word:[ID]``: the classic word embedding model, the ``[ID]`` are listed at
          https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
        - ``flair:[ID]``: the contextual embedding model, the ``[ID]`` are listed at
          https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``pooledflair:[ID]``: the pooled version of the contextual embedding model, the ``[ID]`` are listed at
          https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``byte-pair:[ID]``: the subword-level embedding model, the ``[ID]`` are listed at
          https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md
        - ``Example``: ('word:glove', 'flair:news-forward', 'flair:news-backward')

    :param pooling_strategy: the strategy to merge the word embeddings into the chunk
        embedding. Supported strategies include ``mean``, ``min``, ``max``.
    """

    def __init__(self,
                 embeddings: Union[Tuple[str], List[str]] = ('word:glove', ),
                 pooling_strategy: str = 'mean',
                 *args,
                 **kwargs):
        """Store the embedding specs; the actual models are loaded lazily in :meth:`post_init`."""
        super().__init__(*args, **kwargs)
        self.embeddings = embeddings
        self.pooling_strategy = pooling_strategy
        self.max_length = -1  # reserved variable for future usages
        self._post_set_device = False

    def post_init(self):
        """Load the flair embedding models named in ``self.embeddings``.

        Possible model families are: ``flair``, ``pooledflair``, ``word``,
        ``byte-pair``. Unknown model ids are logged and skipped; if none of the
        specs could be loaded, an error is logged and ``self.model`` is not set.
        """
        import flair
        # make flair allocate its tensors on the encoder's configured device
        flair.device = self.device
        embeddings_list = []
        for e in self.embeddings:
            # each spec is '<family>:<model-id>'; split only on the first ':'
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    from flair.embeddings import FlairEmbeddings
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    from flair.embeddings import PooledFlairEmbeddings
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    from flair.embeddings import WordEmbeddings
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    from flair.embeddings import BytePairEmbeddings
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                # flair raises ValueError for unknown model ids: log and continue
                self.logger.error(f'embedding not found: {e}')
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            from flair.embeddings import DocumentPoolEmbeddings
            self.model = DocumentPoolEmbeddings(embeddings_list, pooling=self.pooling_strategy)
            self.logger.info(f'flair encoder initialized with embeddings: {self.embeddings}')
        else:
            self.logger.error('flair encoder initialization failed.')

    def encode(self, content: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        Encode ``Document`` content from an array of string in size `B` into a ndarray in size `B x D`.

        :param content: a 1-dimension array of string type in size `B`
        :return: an ndarray in size `B x D`
        """
        from flair.data import Sentence
        c_batch = [Sentence(row) for row in content]
        # embed the whole batch in one call, then stack the pooled vectors
        self.model.embed(c_batch)
        result = [self.tensor2array(c_text.embedding) for c_text in c_batch]
        return np.vstack(result)

    def tensor2array(self, tensor):
        # torch tensor -> numpy array; move it off the GPU first when needed
        if isinstance(tensor, np.ndarray):
            return tensor
        return tensor.cpu().numpy() if self.on_gpu else tensor.numpy()
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence
import torch.nn as nn

# Two fastText word-embedding models: one trained on web crawl, one on news.
fasttext_embeddings_web = WordEmbeddings('en-crawl')
fasttext_embeddings_news = WordEmbeddings('en-news')
# Pool both word embeddings into one document vector per sentence.
embedding = DocumentPoolEmbeddings(
    [fasttext_embeddings_news, fasttext_embeddings_web])
cos = nn.CosineSimilarity(dim=0)


def _get_embedding(text):
    """Embed *text* and return its pooled document vector (torch tensor)."""
    sent = Sentence(text)
    embedding.embed(sent)
    return sent.get_embedding()


def _get_cosine_similarity(vec_1, vec_2):
    """Cosine similarity of two vectors, rounded to three decimal places."""
    similarity = cos(vec_1, vec_2).item()
    return round(similarity, 3)


def get_embedding_similarity(sentence_1, sentence_2):
    """Return the cosine similarity between the embeddings of two sentences."""
    return _get_cosine_similarity(_get_embedding(sentence_1),
                                  _get_embedding(sentence_2))
def generate_embeddings(docs, batch_size, model_name='bert-base-cased', pooling='mean', offset=0):
    """ Generator function for generating embeddings from strings using a flair model.

    Takes a list of sentences and yields tuples. The first element represents
    failure (0) or success (1) and the second element contains a list of
    embeddings as numpy arrays if successful, and the indices of the failed
    batch if unsuccessful.

    :param docs: a list of strings for which embeddings should be created
    :param batch_size: integer representing how many embeddings should be created at once
    :param model_name: the model for creating the embeddings. Defaults to document
        embeddings using BERT-Base
    :param pooling: the pooling strategy: 'mean', 'CLS', or 'SentenceBert'
    :param offset: the offset of the integers, for printing out the correct index
    :return: a generator of tuples (success/failure, embeddings/failed_indices)
    """
    # number of trailing docs that do not fill a whole batch
    rest = len(docs) % batch_size
    model = False
    if pooling == 'mean':
        # token-level transformer embeddings, mean-pooled into document vectors
        embedding = TransformerWordEmbeddings(model_name, layers='-1', allow_long_sentences=True)
        model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none')
    elif pooling == 'CLS':
        # use the transformer's own document-level ([CLS]) embedding
        model = TransformerDocumentEmbeddings(model_name)
    if model:
        # full batches first
        for i in range(0, len(docs) - rest, batch_size):
            sentences = [
                Sentence(sentence) for sentence in docs[i:i + batch_size]
            ]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                # typically an OOM; report the failed index range to the caller
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        # then the trailing partial batch, if any
        if rest:
            sentences = [Sentence(sentence) for sentence in docs[-rest:]]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                # NOTE(review): unlike the batched failure above, this index
                # does not add `offset` — confirm whether that is intended.
                yield 0, (len(docs) - rest, 0)
    elif pooling == 'SentenceBert':
        # sentence-transformers path: encode() returns numpy arrays directly
        model = SentenceTransformer(model_name)
        for i in range(0, len(docs) - rest, batch_size):
            try:
                embeddings = model.encode(docs[i:i + batch_size])
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, embeddings
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            try:
                embeddings = model.encode(docs[-rest:])
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, embeddings
            except RuntimeError:
                # NOTE(review): same offset inconsistency as above
                yield 0, (len(docs) - rest, 0)
    else:
        raise Exception("No Valid model")
from WorkforceSentimentMonitoring.data import get_data, merge, drop_wrong_language
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence  # NOTE(review): duplicate import, harmless
import numpy as np
import swifter
import pickle
import os
import numpy as np  # NOTE(review): duplicate import, harmless
import pandas as pd

# Forward + backward character-level language models, pooled into a single
# document vector per text.
embedder = DocumentPoolEmbeddings([
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
])


def embed(text, embedder):
    """Return the pooled flair embedding of *text* as a numpy array.

    :param text: the string to embed
    :param embedder: a flair DocumentPoolEmbeddings (or compatible) instance
    :return: 1-D numpy array with the document embedding
    """
    sentence = Sentence(text)
    embedder.embed(sentence)
    return sentence.get_embedding().detach().numpy()


if __name__ == '__main__':
    # submission, train, test = get_data()
    # df = merge(submission, train, test)
    # df = drop_wrong_language(df, "review")
    # path = os.path.split(os.path.abspath('__file__'))[0]
    # file = os.path.join(path, 'pickle_files/reviews_eng.p')
    # with open(file, 'wb') as f:
from textblob import TextBlob
import numpy as np
from multiprocessing import Process
import nltk
import nltk.data
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, BertEmbeddings
from spacy.gold import biluo_tags_from_offsets
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity as cosinSim

# NOTE(review): `spacy` itself is used below but not imported in this view —
# confirm `import spacy` appears earlier in the file.
nlp = spacy.load("en_core_web_sm")
embeddings = WordEmbeddings('glove')
EMBEDDING_DIM = 100  # presumably the GloVe vector size used here — TODO confirm
# max-pool the GloVe word vectors into a single document vector
document_embeddings = DocumentPoolEmbeddings([embeddings], pooling='max')
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 1),
                     min_df=1,
                     stop_words='english')


def get_token_offset(span, paragraph, answer_start, answer_end):
    """Locate *span* inside *paragraph* at spaCy-token level.

    Tokenises both strings with the shared `nlp` pipeline and finds the token
    index range where the span's token sequence occurs in the paragraph.
    Raises ValueError (via the single-element unpack) if the span occurs zero
    or more than one time.
    """
    p_doc = nlp(paragraph)
    paragraph_tokens = [token.text for token in p_doc]
    span_doc = nlp(span)
    span_tokens = [token.text for token in span_doc]
    # single-element destructuring enforces exactly one match
    [(start, end)
     ] = [(i, i + len(span_tokens)) for i in range(len(paragraph_tokens))
          if (paragraph_tokens[i] == span_tokens[0]
              and paragraph_tokens[i:i + len(span_tokens)] == span_tokens)]
class FlairTransformer(BaseEstimator, TransformerMixin):
    """sklearn pipeline step that maps a dataframe's 'text' column to pooled
    flair document embeddings (one numpy vector per row)."""

    def __init__(
        self,
        embeddings: List[TokenEmbeddings],
        fine_tune_mode="linear",
        pooling: str = "mean",
        batch_size=32,
    ):
        """
        constructor

        :param embeddings: token-level flair embeddings to pool per document
        :param fine_tune_mode: passed through to DocumentPoolEmbeddings
        :param pooling: pooling strategy for DocumentPoolEmbeddings
        :param batch_size: number of sentences embedded per batch
        """
        super(FlairTransformer, self).__init__()
        self.embedder = DocumentPoolEmbeddings(embeddings=embeddings,
                                               fine_tune_mode=fine_tune_mode,
                                               pooling=pooling)
        self.batch_size = batch_size
        # per-sentence embedding cache, keyed by the sentence's original text
        self.vector_cache = {}
        # whole-dataset embedding cache, keyed by a hash of data + embedder config
        self.dataset_cache = {}

    def fit(self, X, y=None, **kwargs):
        """
        an abstract method that is used to fit the step and to learn by examples

        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: self: the class object - an instance of the transformer - Transformer
        """
        # No fitting needed, using pre-trained embeddings_baseline
        return self

    def transform(self, X, y=None, **kwargs):
        """
        transform the 'text' column of X into a stacked embedding matrix

        :param X: features - Dataframe (must contain a 'text' column)
        :param y: target vector - Series (unused)
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - 2-D numpy array, one row per text
        """
        X = X['text']
        # NOTE(review): hash(str(X)) relies on the pandas string repr (which
        # truncates long series) — two different datasets could collide.
        dataset_hash = hash(str(X) + str(self.embedder.__dict__))
        if dataset_hash in self.dataset_cache:
            return self.dataset_cache[dataset_hash]
        else:
            embeddings = []
            for first in trange(0, len(X), self.batch_size):
                subset = X[first:first + self.batch_size]
                sentences = []
                for element in subset:
                    sentence = Sentence(element)
                    # sentence.tokens = sentence.tokens[:200]
                    sentences.append(sentence)
                # embed the whole batch in one flair call
                self.embedder.embed(sentences)
                for sentence in sentences:
                    key = sentence.to_original_text()
                    if key in self.vector_cache.keys():
                        vector = self.vector_cache[key]
                    else:
                        vector = sentence.get_embedding().cpu().detach().numpy(
                        )
                        self.vector_cache[key] = vector
                    embeddings.append(vector)
            embedding_dataset = numpy.vstack(embeddings)
            self.dataset_cache[dataset_hash] = embedding_dataset
            return embedding_dataset

    def fit_transform(self, X, y=None, **kwargs):
        """
        perform fit and transform over the data (fit is a no-op here)

        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - 2-D numpy array
        """
        return self.transform(X, y)
def train_model(self,
                model_name="text_classification_model",
                custom_word_embeddings=None,
                rnn_type="GRU",
                use_pool_embedding=False,
                hidden_size=16,
                reproject_words=True,
                reproject_words_dimension=128,
                learning_rate=1e-3,
                batch_size=8,
                anneal_factor=0.5,
                patience=2,
                max_epochs=30,
                **kwargs):
    """ Train flair model and save it in your data folder

    Parameters
    ----------
    model_name: str
        Name of your model
    custom_word_embeddings: list<embedding>
        Use custom flair embedding
    rnn_type: str
        RNN architecture for DocumentRNNEmbeddings (e.g. "GRU", "LSTM")
    use_pool_embedding: bool
        If True, pool word embeddings (max, nonlinear fine-tune) instead of
        an RNN document embedding
    hidden_size, reproject_words, reproject_words_dimension:
        DocumentRNNEmbeddings parameters (ignored when pooling is used)
    learning_rate, batch_size, anneal_factor, patience, max_epochs:
        training hyper-parameters forwarded to flair's ModelTrainer

    See more in flair documentation:
    https://github.com/zalandoresearch/flair/tree/master/resources/docs

    Return
    -------
    None
    """
    self.model_name = model_name
    # build the corpus from the CSV data folder configured on the instance
    corpus = CSVClassificationCorpus(self.data_folder,
                                     self.column_name_map,
                                     skip_header=True)
    label_dict = corpus.make_label_dictionary()
    # Word embedding selection: default to French word embeddings
    if custom_word_embeddings is None:
        word_embeddings = [WordEmbeddings('fr')]
    else:
        word_embeddings = custom_word_embeddings
    # initialize document embedding by passing list of word embeddings and parameters
    if use_pool_embedding:
        document_embeddings = DocumentPoolEmbeddings(
            word_embeddings, pooling='max', fine_tune_mode='nonlinear')
    else:
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=hidden_size,
            reproject_words=reproject_words,
            reproject_words_dimension=reproject_words_dimension,
            rnn_type=rnn_type)
    # create the text classifier and initialize trainer
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    # let's train !
    num_workers = cpu_count()
    # NOTE(review): the '\\' separator assumes Windows paths — os.path.join
    # would be portable; confirm target platform before changing.
    trainer.train("{0}\\{1}".format(self.data_folder, self.model_name),
                  learning_rate=learning_rate,
                  num_workers=num_workers,
                  mini_batch_size=batch_size,
                  anneal_factor=anneal_factor,
                  patience=patience,
                  max_epochs=max_epochs,
                  **kwargs)
""" ''' prerequisites : flair (pip install flair) translate the hindi text into english or viceversa E_sentences - glossary file of english text book H_sentences_eng - translated glossary file of hindi text book ''' from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence document_embeddings = DocumentPoolEmbeddings([ WordEmbeddings('glove'), FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), ]) #opening english file and embedding each sentence eng = open('<PATH> /E_sentences.txt', 'r') #vec_eng=open('/home/dheeraj/Desktop/IIITH-Intern/Major-TH-Tool/glossary/E_vecs.txt','a') line = eng.readline() eng_vecs = [] while (line): sentence = Sentence(line) document_embeddings.embed(sentence) li = sentence.get_embedding() li = li.tolist() eng_vecs.append(li) line = eng.readline()
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings (default pooling is mean)
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())

# NOTE(review): this pool lists flair_embedding_backward twice — the forward
# model is probably intended as the third entry. Also `mode=` is the old
# keyword; current flair uses `pooling=`. Confirm against the installed version.
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_backward],
    mode='min')

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')
# RNN-based document embedding over GloVe, using an LSTM cell
document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type='LSTM')
import torch # Some parameters SCORE_THRESHOLD = .6 # [0., 1.] The minimum value for an aceptable question WINDOW_LENGTH = 128 # Character window length QG_BEAM_SIZE = 3 # Beam-size used on question generation decoder # # tagger = SequenceTagger.load('ner-ontonotes') ne = Vocabulary.from_vocab_file('vocabularies/biology.vocab').compile() qg = QuestionGenerator('pretrained_models/qg_model.bin', beam_size=QG_BEAM_SIZE) qa = pipeline('question-answering') # initialize the document embeddings, mode = mean document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove'), FlairEmbeddings('news-backward'), FlairEmbeddings('news-forward')]) remove_punct = re.compile(r"[\(\)\'\":?¿!¡;]") def answer_similarity(ans1, real): sent1 = Sentence(ans1) sent2 = Sentence(real) document_embeddings.embed(sent1) document_embeddings.embed(sent2) emb1 = sent1.get_embedding() emb2 = sent2.get_embedding() emb1 /= torch.sqrt((emb1**2).sum()) emb2 /= torch.sqrt((emb2**2).sum())
class Embeddings(BaseMatcher):
    """ Embed words into vectors and use cosine similarity to find
    the best matches between two lists of strings

    Arguments:
        embedding_method: list of Flair embeddings to use
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        cosine_method: The method/package for calculating the cosine similarity.
                        Options: "sparse", "sklearn", "knn".
                        Sparse is the fastest and most memory efficient but requires a
                        package that might be difficult to install.
                        Sklearn is a bit slower than sparse and requires significantly more memory as
                        the distance matrix is not sparse.
                        Knn uses 1-nearest neighbor to extract the most similar strings
                        it is significantly slower than both methods but requires little memory
        model_id: The name of the particular instance, used when comparing models

    Usage:

    ```python
    model = Embeddings(min_similarity=0.5)
    ```

    Or if you want a custom model to be used and it is a word embedding model,
    pass it in as a list:

    ```python
    embedding_model = WordEmbeddings('news')
    model = Embeddings([embedding_model], min_similarity=0.5)
    ```

    As you might have guessed, you can pass along multiple word embedding models
    and the results will be averaged:

    ```python
    fasttext_embedding = WordEmbeddings('news')
    glove_embedding = WordEmbeddings('glove')
    bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')
    model = Embeddings([glove_embedding, fasttext_embedding, bert_embedding], min_similarity=0.5)
    ```
    """

    def __init__(self,
                 embedding_method: Union[List, None] = None,
                 min_similarity: float = 0.75,
                 cosine_method: str = "sparse",
                 model_id: str = None):
        super().__init__(model_id)
        self.type = "Embeddings"
        # Normalise the many accepted input shapes into a document embedder:
        # None -> default 'news' word embeddings; list -> pooled together;
        # single TokenEmbeddings -> wrapped in a pool; anything else is assumed
        # to already be a document-level embedder.
        if not embedding_method:
            self.document_embeddings = DocumentPoolEmbeddings(
                [WordEmbeddings('news')])
        elif isinstance(embedding_method, list):
            self.document_embeddings = DocumentPoolEmbeddings(embedding_method)
        elif isinstance(embedding_method, TokenEmbeddings):
            self.document_embeddings = DocumentPoolEmbeddings(
                [embedding_method])
        else:
            self.document_embeddings = embedding_method
        self.min_similarity = min_similarity
        self.cosine_method = cosine_method

    def match(self,
              from_list: List[str],
              to_list: List[str],
              embeddings_from: np.ndarray = None,
              embeddings_to: np.ndarray = None) -> pd.DataFrame:
        """ Matches the two lists of strings to each other and returns the best mapping

        Arguments:
            from_list: The list from which you want mappings
            to_list: The list where you want to map to
            embeddings_from: Embeddings you created yourself from the `from_list`
            embeddings_to: Embeddings you created yourself from the `to_list`

        Returns:
            matches: The best matches between the lists of strings

        Usage:

        ```python
        model = Embeddings(min_similarity=0.5)
        matches = model.match(["string_one", "string_two"],
                              ["string_three", "string_four"])
        ```
        """
        # only compute embeddings that the caller did not supply
        if not isinstance(embeddings_from, np.ndarray):
            embeddings_from = self._embed(from_list)
        if not isinstance(embeddings_to, np.ndarray):
            embeddings_to = self._embed(to_list)
        matches = cosine_similarity(embeddings_from, embeddings_to, from_list,
                                    to_list, self.min_similarity,
                                    self.cosine_method)
        return matches

    def _embed(self, strings: List[str]) -> np.ndarray:
        """ Create normalised embeddings from a list of strings """
        embeddings = []
        for name in strings:
            sentence = Sentence(name)
            self.document_embeddings.embed(sentence)
            embeddings.append(sentence.embedding.cpu().numpy())
        # normalise so the downstream cosine computation works on unit vectors
        return np.array(normalize(embeddings), dtype="double")
def __create_models(self):
    """Build all text representations (bag-of-words + byte-pair/character
    embedding pools), materialise train/validation/full-data matrices for
    each, and train one classifier per representation plus one fitted on
    all available data.
    """
    models = []
    models_fit = []
    # keys starting with '_' are meta-options, not TextModel parameters
    _params = {}
    for k, v in self.params.items():
        if k.startswith('_'):
            continue
        _params[k] = v
    # Candidate representations. Several other variants (character-only,
    # flair LM and BERT pools) were tried and intentionally left disabled.
    self.textModels = dict(
        mtc=TextModel(_params).fit(self.train),
        langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
        charLangMultiEmb=DocumentPoolEmbeddings([
            CharacterEmbeddings(),
            BytePairEmbeddings(self.lang),
            BytePairEmbeddings('multi')
        ]),
        langMultiEmb=DocumentPoolEmbeddings(
            [BytePairEmbeddings(self.lang),
             BytePairEmbeddings('multi')]),
        bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
    )

    def _embed_matrix(tmodel, texts):
        # One pooled embedding per text, stacked into a 2-D numpy array.
        # (Extracted: this loop was previously copy-pasted three times.)
        sentences = [Sentence(txt) for txt in texts]
        tmodel.embed(sentences)
        return np.array([
            s.get_embedding().cpu().detach().numpy() for s in sentences
        ])

    for km, tmodel in self.textModels.items():
        models.append({'name': km})
        models_fit.append({'name': km})
        if km == 'mtc':
            # bag-of-words model transforms directly
            xt = tmodel.transform(self.train)
            xv = tmodel.transform(self.validation)
            X = tmodel.transform(self.data)
        else:
            xt = _embed_matrix(tmodel, self.train)
            xv = _embed_matrix(tmodel, self.validation)
            X = _embed_matrix(tmodel, self.data)
        models[-1]['xv'] = xv
        models[-1]['xt'] = xt
        models_fit[-1]['xt'] = X
    print('Fitting Ensemble')
    self.models, self.models_fit = [], []
    for md, mdf in zip(models, models_fit):
        self.models.append(self._train_model(md))
        # FIX: this previously called self._train_model(md) again, training
        # the validation-split model twice and ignoring the full-data
        # matrices prepared in models_fit — train on mdf instead.
        self.models_fit.append(self._train_model(mdf))
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from random import randrange
from scipy.sparse import coo_matrix
from selenium import webdriver
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from time import time
from tqdm import tqdm
import json
import pandas as pd
import string
import torch

# choosing fasttext ('en') for its subword information
embedding = DocumentPoolEmbeddings([WordEmbeddings('en')])


def text_scraper(urls, file):
    """function for scraping the text of a webpage given url

    :param urls: iterable of URLs to visit with the Chrome webdriver
    :param file: output target — not used in the visible portion; presumably
        where the scraped pages are written. TODO confirm against the rest
        of the function.
    """
    start = time()  # wall-clock start, presumably for a timing report later
    print('SCRAPING WEBPAGES...')
    # creating a new instance of google chrome
    driver = webdriver.Chrome('./chromedriver')
    pages = []
    for url in tqdm(urls):
        driver.get(url)
        # extracting the title, content and date
        title = driver.find_element_by_tag_name('h1').text
        content = driver.find_element_by_tag_name('body').text
import torch
import torch.nn as nn
import numpy as np
import nltk
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence

# initialize the word embeddings
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings (default pooling is mean)
document_embeddings = DocumentPoolEmbeddings(
    [flair_embedding_backward, flair_embedding_forward])

# Hyper Parameters
BATCH_SIZE = 16


class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of embedding batches.

    :param margin: separation enforced between dissimilar pairs — TODO
        confirm exact use; the visible code only computes the pairwise
        Euclidean distance.
    """

    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, input1, input2, y):
        # Euclidean distance between corresponding rows of the two batches
        diff = input1 - input2
        dist_sq = torch.sum(torch.pow(diff, 2), 1)
        dist = torch.sqrt(dist_sq)
# from methods import * from database import Database import torch import flair import pickle from flair.data import Sentence from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings curr_path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.join(curr_path, "data") flair.device = torch.device('cuda:0') flair.embedding_storage_mode = None flair_emb = DocumentPoolEmbeddings([ FlairEmbeddings('en-forward-fast'), FlairEmbeddings('en-backward-fast') ], pooling='mean', ) cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) poss_sections = { '#introduction': ['intro', 'introduction', 'starting'], '#abstract': ['abstract', 'abstracts'], '#sota': ['background', 'backgrounds', 'state of the art', 'previous', 'related work'], '#method': ['method', 'methods', 'methodology', 'material', 'materials', 'development', 'description', 'model', 'procedures'], '#experiments_or_results': ['experiments', 'experiment', 'analysis', 'analytics', 'analisy', 'statistics', 'regression', 'analises', 'results', 'result', 'evaluation', 'measures', 'correlation', 'comparison', 'tests', 'test', 'lab', 'laboratory'], '#conclusions': ['conclusion', 'conclusions', 'discussion', 'discussions'], } for list_candidates in poss_sections.values():
def _embed_document(self, document_text: str, doc_embeddings: DocumentPoolEmbeddings):
    """Embed *document_text* with *doc_embeddings* and return the pooled
    document vector as a numpy array (moved to CPU first)."""
    doc = Sentence(document_text)
    doc_embeddings.embed(doc)
    pooled = doc.get_embedding()
    return pooled.data.cpu().numpy()
import senteval # Set params for SentEval # we use logistic regression (usepytorch: Fasle) and kfold 10 # In this dictionary you can add extra information that you model needs for initialization params_senteval = { 'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': params.folds } b = [] for i in params.model: b.append(BertEmbeddings(i)) #f.append(eval(i)) bert_encoder = DocumentPoolEmbeddings(b) params_senteval['bert'] = bert_encoder print(params_senteval['bert']) nhid = params.nhid params_senteval['classifier'] = { 'nhid': nhid, 'optim': 'adam', 'batch_size': 64, 'tenacity': 5, 'epoch_size': 4 } def prepare(params, samples):
def other_embeddings(embd):
    """Embed the global train/test/validation texts with the selected flair model.

    :param embd: one of 'glove', 'xlnet', 'fasttext', 'elmo'; any other value
        falls back to stacked glove + multilingual flair embeddings
    :return: (train_data_list, test_data_list, val_data_list) — lists holding
        one numpy vector per text in final_train / final_test / final_val
    """
    # .eval() below needs a default TF session to be registered
    sess = tf.InteractiveSession()

    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # default: glove + multilingual forward/backward flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])

    def _embed_texts(texts):
        # One pooled document vector per text. FIX: this loop was previously
        # copy-pasted three times (train/test/val); it is now shared.
        vectors = []
        for raw_text in texts:
            sentence = Sentence(raw_text)
            document_embeddings.embed(sentence)
            emb = sentence.get_embedding().detach().numpy()
            # TF round-trip kept to preserve the original arrays exactly
            emb = tf.constant(emb).eval()
            vectors.append(emb)
        return vectors

    print('Train embedding Started...')
    train_data_list = _embed_texts(final_train['text'].tolist())
    print('Embedded Train data!!')
    print('Test embedding Started...')
    test_data_list = _embed_texts(final_test['text'].tolist())
    print('Embedded Test data!!')
    val_data_list = _embed_texts(final_val['text'].tolist())
    # FIX: the validation loop previously printed 'Embedded Test data!!'
    print('Embedded Val data!!')
    return train_data_list, test_data_list, val_data_list
class FlairEmbeddingsClassifier(BaseEstimator):
    """Multi-label nearest-neighbour classifier over pooled Flair embeddings.

    ``fit`` builds one "tag document" per label by concatenating every
    training text carrying that label, embeds each tag document with
    ``DocumentPoolEmbeddings`` and stores the resulting vectors.
    ``predict``/``decision_function`` embed each input text the same way
    and rank labels by embedding distance.
    """

    def __init__(self,
                 word_embeddings: List[Embeddings] = (WordEmbeddings('de'),
                                                      WordEmbeddings('de-crawl')),
                 pooling: str = 'mean',
                 fine_tune_mode: str = 'nonlinear',
                 distance_metric: str = 'cosine',
                 n_jobs: int = 1,
                 verbose: bool = False):
        # Plain attribute assignment only, per the scikit-learn estimator contract.
        self.word_embeddings = word_embeddings
        self.pooling = pooling
        self.fine_tune_mode = fine_tune_mode
        self.distance_metric = distance_metric
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y):
        """Compute one embedding per label from the training texts.

        :param X: iterable of raw text samples
        :param y: sparse binary label matrix, shape (n_samples, n_labels)
        :return: self
        """
        tag_docs = self._create_tag_corpus(X, self._create_tag_docs(y))
        self.document_embedder_ = DocumentPoolEmbeddings(
            self.word_embeddings,
            pooling=self.pooling,
            fine_tune_mode=self.fine_tune_mode)
        if self.verbose:
            doc_iterator = tqdm(tag_docs, desc='Computing tag embeddings')
        else:
            doc_iterator = tag_docs
        self.tag_embeddings_ = []
        for doc in doc_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            self.tag_embeddings_.append(
                doc_obj.get_embedding().detach().numpy())
        self.tag_embeddings_ = np.array(self.tag_embeddings_)
        return self

    def predict(self, X: List[str], n_labels: int = 10) -> np.array:
        """Return a sparse binary matrix flagging the ``n_labels`` nearest tags
        for each sample in ``X``."""
        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError
        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X
        X_embeddings = []
        for doc in X_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            X_embeddings.append(doc_obj.get_embedding().detach().numpy())
        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)
        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='int8')
        for sample_ind, text_embedding in enumerate(X_embeddings):
            nearest_neighbors = nn.kneighbors([text_embedding])[1][0]
            y_pred[sample_ind, nearest_neighbors] = 1
        return y_pred.tocsr()

    def decision_function(self, X: List[str], n_labels: int = 10):
        """Like :meth:`predict`, but stores raw distances to the ``n_labels``
        nearest tags instead of binary indicators."""
        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError
        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X
        X_embeddings = []
        for doc in X_iterator:
            # Empty strings cannot be embedded; fall back to a placeholder.
            # (The original placeholder text — typo included — is kept so
            # existing behaviour/embeddings are unchanged.)
            if doc:
                doc_obj = Sentence(doc)
            else:
                doc_obj = Sentence('Unkown')
            self.document_embedder_.embed(doc_obj)
            try:
                X_embeddings.append(doc_obj.get_embedding().detach().numpy())
            except RuntimeError as e:
                # TODO give index of corrupted sample
                print('Could not compute embedding for sample, '
                      'inserting zero vector')
                print(e)
                # Fixed: the original called np.zeros((self.tag_embeddings_[1],))
                # which indexes ROW 1 of the embedding matrix (an array, not a
                # size) and raises.  Use the embedding width instead.
                X_embeddings.append(
                    np.zeros((self.tag_embeddings_.shape[1], ),
                             dtype=self.tag_embeddings_.dtype))
        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)
        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='float')
        for sample_ind, sample_vec in enumerate(X_embeddings):
            distances, indices = nn.kneighbors([sample_vec])
            for distance, label_index in zip(distances, indices):
                y_pred[sample_ind, label_index] = distance
        return y_pred.tocsr()

    def log_decision_function(self, X: Iterable[str], n_labels: int = 10):
        """Return :meth:`decision_function` distances mapped through a log so
        that smaller distances rank higher (see :meth:`_get_log_distances`)."""
        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError
        distances = self.decision_function(X=X, n_labels=n_labels)
        log_distances = self._get_log_distances(distances)
        return log_distances

    def _get_log_distances(self, y_distances: csr_matrix,
                           base=0.5) -> csr_matrix:
        """
        Return the logarithmic version (base default: 0.5) of the distance
        matrix produced by :meth:`decision_function`.  With a base < 1,
        small distances map to large scores, which is required for valid
        precision@k ranking.

        :param y_distances: sparse distance matrix (multilabel matrix with
            distances instead of binary indicators)
        :param base: base of the log function (must be smaller than one)
        :return: sparse matrix with the log values
        """
        log_y_distances = y_distances.tocoo()
        log_y_distances.data = np.log(log_y_distances.data) / np.log(base)
        return log_y_distances.tocsr()

    def _create_tag_corpus(self, X: np.array,
                           tag_doc_idx: np.array) -> List[str]:
        """
        Create the corpus used to train the tag embeddings: all texts
        associated with one tag are concatenated into one big document.

        :param X: iterable of the texts as strings
        :param tag_doc_idx: mapping of each label to its associated texts
        :return: list of shape (n_tags,) containing the concatenated texts
        """
        tag_corpus = list()
        if self.verbose:
            print('Creating Tag-Doc Corpus')
            iterator = tqdm(tag_doc_idx)
        else:
            iterator = tag_doc_idx
        for indices in iterator:
            tag_corpus.append(" ".join(X[indices]))
        return tag_corpus

    def _create_tag_docs(self, y: csr_matrix) -> np.ndarray:
        """
        Create a mapping of each tag to its associated texts.

        :param y: sparse label matrix
        :return: array of shape (n_labels,) containing, per label, the
            indices of each text carrying that label
        """
        self.classes_ = y.shape[1]
        if self.verbose:
            print('Sorting tag and docs')
            iterator = tqdm(y.T)
        else:
            iterator = y.T
        tag_doc_idx = list()
        for tag_vec in iterator:
            pos_samples = tag_vec.nonzero()[1]  # indices of positive samples
            tag_doc_idx.append(pos_samples)
        return np.asarray(tag_doc_idx)
class HyperpartisanDatasetFlair(HyperpartisanDataset, FlairDataset):
    """
    Hyperpartisan News Dataset using flair-based embeddings.

    Produces document-, sentence- or token-wise pooled embeddings for news
    articles, optionally averaging the last N concatenated embedding layers.
    """

    def __init__(self,
                 articles: Sequence[NewsArticle],
                 max_seq_len: int = 200,
                 granularity: Union[str, Sequence[str]] = 'token',
                 use_title: bool = True,
                 max_sent_len: int = 100,
                 embeddings: Sequence[str] = ('word',),
                 avg_layers: Optional[int] = None,
                 use_cuda: bool = False):
        # Fixed: ``embeddings`` defaulted to a mutable list (['word']),
        # which is shared across calls; an immutable tuple is safe and
        # backward compatible.
        super().__init__(
            articles=articles,
            max_seq_len=max_seq_len,
            granularity=granularity,
            max_sent_len=max_sent_len,
            use_title=use_title,  ## HyperpartisanDataset.__init__ args
            embeddings=embeddings,
            use_cuda=use_cuda,  ## FlairDataset.__init__ args
        )
        # Pool the token embeddings (set up by FlairDataset) into one vector.
        self.embeddings = DocumentPoolEmbeddings(self.token_embeddings,
                                                 pooling='mean')
        # If set, average the last N concatenated layers instead of keeping
        # the full concatenation (see _avg_last_n_layers).
        self.avg_layers = avg_layers
        print('\nEmbeddings Model:')
        print(self.embeddings, end='\n\n')
        # spaCy is used only for sentence splitting.
        self.nlp = spacy.load('en_core_web_sm',
                              disable=['ner', 'parser', 'tagger'])
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def _get_span_embedding(self,
                            text: str,
                            max_seq_len: Optional[int] = None) -> torch.Tensor:
        """
        Return the pooled embedding for the given text, shape (embeddings_dim,).
        Too-short or un-embeddable texts yield a zero vector.
        """
        if len(text) < 2:
            print('Sentence is too short: "{}"'.format(text))
            return torch.zeros(self.embeddings.embedding_length,
                               dtype=torch.float32)
        s = Sentence(text)
        if max_seq_len is not None and len(
                s
        ) > max_seq_len and self.embeddings_type != 'elmo':  ## Don't crop ELMo sentences, just an experiment
            s.tokens = s.tokens[:max_seq_len]
        if self.embeddings_type == 'bert' or self.bert_tokenizer is not None:
            # Crop to BERT's subword budget; an empty result means the text
            # cannot be embedded.
            sent_len = self.crop_sentence_to_fit_bert(s)
            if sent_len == 0 or len(s) == 0:
                return torch.zeros(self.embeddings.embedding_length,
                                   dtype=torch.float32)
        self.embeddings.embed(s)
        return s.embedding

    def get_tokenwise(self, article: NewsArticle) -> torch.Tensor:
        """Return token-wise embeddings for title (optional) + text."""
        return self._get_tokenwise_embeddings(
            (article.get_title() if self.use_title else "") +
            article.get_text(), self.max_seq_len)

    def get_documentwise(self, article: NewsArticle) -> torch.Tensor:
        """Returns document-wise embeddings"""
        text = (article.get_title()
                if self.use_title else "") + article.get_text()
        return self._get_span_embedding(text, self.max_seq_len)

    def get_sentencewise(self, article: NewsArticle):
        """Return one pooled embedding per sentence, row 0 being the title
        when ``use_title`` is set; shape (max_seq_len, embedding_length)."""
        X = torch.zeros(self.max_seq_len,
                        self.embeddings.embedding_length,
                        dtype=torch.float32)
        # Title embedding
        if self.use_title:
            X[0] = self._get_span_embedding(article.get_title(),
                                            self.max_sent_len)
        # Sentence embeddings
        for i, s in enumerate(
                self.nlp(article.get_text()).sents, 1 if self.use_title else 0):
            if i >= self.max_seq_len:
                break
            X[i] = self._get_span_embedding(s.text, self.max_sent_len)
        if self.avg_layers is not None:
            return self._avg_last_n_layers(X, self.avg_layers)
        return X

    def _avg_last_n_layers(self, X, last_n_layers):
        """
        Average the last_n_layers from the given embedding representation,
        instead of the default concatenation.
        """
        final_emb_len = X.shape[-1] // last_n_layers
        assert X.shape[-1] % last_n_layers == 0
        X_new = torch.zeros(X.shape[0], final_emb_len, dtype=torch.float32)
        for i, emb in enumerate(X):
            for k in range(last_n_layers):
                X_new[i] += emb[k * final_emb_len:(k + 1) * final_emb_len]
            X_new[i] /= last_n_layers
        return X_new

    def get_tokenwise_grouped(self, article: NewsArticle) -> torch.Tensor:
        """Returns token-wise embeddings grouped by sentences"""
        X = torch.zeros(self.max_seq_len,
                        self.max_sent_len,
                        self.get_embeddings_dim(),
                        dtype=torch.float32)
        # Title embedding
        if self.use_title:
            X[0] = self._get_tokenwise_embeddings(article.get_title(),
                                                  self.max_sent_len)
        # Text embeddings
        for i, sent in enumerate(
                self.nlp(article.get_text()).sents, 1 if self.use_title else 0):
            if i >= self.max_seq_len:
                break
            X[i] = self._get_tokenwise_embeddings(sent.text, self.max_sent_len)
        return X

    def get_embeddings_dim(self) -> int:
        """Embedding width after optional layer-averaging."""
        return \
            self.embeddings.embedding_length if self.avg_layers is None else \
            self.embeddings.embedding_length // self.avg_layers
# Train a multi-label flair TextClassifier over pooled BERT + GloVe
# document embeddings; corpus CSVs are read from the working directory.
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# Word-level and contextual embeddings that get pooled into one document vector.
glove_embedding = WordEmbeddings('glove')
bert_embedding = BertEmbeddings('bert-base-uncased')

corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)

document_embeddings = DocumentPoolEmbeddings([bert_embedding, glove_embedding])

# Multi-label classifier; the label dictionary is derived from the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dictionary,
                            multi_label=True)

trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
import datetime
import spacy
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentLSTMEmbeddings, Sentence

nlp = spacy.load('de')

# German word/character embeddings pooled two ways (mean pool and LSTM).
glove_embedding = WordEmbeddings('de')
flair_embedding_forward = FlairEmbeddings('german-forward')
flair_embedding_backward = FlairEmbeddings('german-backward')
document_pooling_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])
document_lstm_embeddings = DocumentLSTMEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])

# Domain noise words (police-report boilerplate, German stop words, years).
# Built ONCE at module level as a frozenset: the original re-created a list
# on every call and scanned it in O(n) per membership test.
_BLACKLIST = frozenset([
    'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr',
    'polizeimeldung', 'nicht', 'jahr', 'jährige', 'jährig', 'jähriger',
    'polizeiliche', 'polizeilich', '2015', '2016', '2014', '2017', '2018',
    'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem',
    'den', 'diese', 'dieser', 'diesen', 'diesem', 'um', 'für', 'eine',
    'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen',
    'anders'
])


def is_blacklisted(word):
    """Return True if *word* is a known noise/stop word (O(1) lookup)."""
    return word in _BLACKLIST


def is_empty(word):
    """Return True if *word* is blank or whitespace-only."""
    return word.strip() == ''
class FlairTextEncoder(BaseTextTorchEncoder):
    """
    :class:`FlairTextEncoder` encodes an array of `B` strings into a
    `B x D` ndarray by wrapping Flair's ``DocumentPoolEmbeddings``.
    """

    def __init__(self,
                 embeddings: Union[Tuple[str], List[str]] = ('word:glove',
                                                             'flair:news-forward',
                                                             'flair:news-backward'),
                 pooling_strategy: str = 'mean',
                 *args,
                 **kwargs):
        """
        :param embeddings: the names of the embedding models, each written as
            ``prefix:[ID]``. Supported prefixes:

            - ``word:[ID]``: classic word embeddings, IDs listed at
              https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
            - ``flair:[ID]``: contextual embeddings, IDs listed at
              https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
            - ``pooledflair:[ID]``: pooled contextual embeddings, IDs listed at
              https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
            - ``byte-pair:[ID]``: subword-level embeddings, IDs listed at
              https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md
        :param pooling_strategy: how word embeddings are merged into the chunk
            embedding; one of ``mean``, ``min``, ``max``.
        """
        super().__init__(*args, **kwargs)
        self.embeddings = embeddings
        self.pooling_strategy = pooling_strategy
        self.max_length = -1  # reserved variable for future usages
        self._post_set_device = False

    def post_init(self):
        """Instantiate every requested embedding and build the pooled model."""
        import flair
        flair.device = self.device
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings

        # Prefix -> constructor dispatch (replaces the if/elif chain).
        ctor_by_prefix = {
            'flair': FlairEmbeddings,
            'pooledflair': PooledFlairEmbeddings,
            'word': WordEmbeddings,
            'byte-pair': BytePairEmbeddings,
        }
        loaded = []
        for spec in self.embeddings:
            prefix, model_id = spec.split(':', maxsplit=1)
            ctor = ctor_by_prefix.get(prefix)
            if ctor is None:
                # Unknown prefix: silently skipped, as before.
                continue
            try:
                loaded.append(ctor(model_id))
            except ValueError:
                self.logger.error('embedding not found: {}'.format(spec))
        if loaded:
            self.model = DocumentPoolEmbeddings(loaded,
                                                pooling=self.pooling_strategy)
            self.logger.info('flair encoder initialized with embeddings: {}'.format(self.embeddings))
        else:
            self.logger.error('flair encoder initialization failed.')

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D`
        """
        import torch
        from flair.embeddings import Sentence
        sentences = [Sentence(text) for text in data]
        self.model.embed(sentences)
        stacked = torch.stack([s.get_embedding() for s in sentences]).detach()
        if self.on_gpu:
            stacked = stacked.cpu()
        return stacked.numpy()
class FlairBackend(BaseEmbedder):
    """ Flair Embedding Model
    The Flair embedding model used for generating document and word
    embeddings.
    Arguments:
        embedding_model: A Flair embedding model
    Usage:
    ```python
    from bertopic.backend import FlairBackend
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

    # Create a Flair Embedding model
    glove_embedding = WordEmbeddings('crawl')
    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])

    # Pass the Flair model to create a new backend
    flair_embedder = FlairBackend(document_glove_embeddings)
    ```
    """

    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
        super().__init__()

        # Flair word embeddings: wrap in a pooled document embedding
        if isinstance(embedding_model, TokenEmbeddings):
            self.embedding_model = DocumentPoolEmbeddings([embedding_model])

        # Flair document embeddings + disable fine tune to prevent CUDA OOM
        # https://github.com/flairNLP/flair/issues/1719
        elif isinstance(embedding_model, DocumentEmbeddings):
            if "fine_tune" in embedding_model.__dict__:
                embedding_model.fine_tune = False
            self.embedding_model = embedding_model

        else:
            raise ValueError("Please select a correct Flair model by either using preparing a token or document "
                             "embedding model: \n"
                             "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
                             "`roberta = TransformerDocumentEmbeddings('roberta-base')`")

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional matrix of
        embeddings
        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process
        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        embeddings = []
        # Fixed: the original enumerated the documents but never used the
        # index; iterating directly also lets tqdm report a total.
        for document in tqdm(documents, disable=not verbose):
            try:
                # Empty strings cannot be turned into a flair Sentence.
                sentence = Sentence(document) if document else Sentence("an empty document")
                self.embedding_model.embed(sentence)
            except RuntimeError:
                # Fall back to a placeholder when flair cannot embed the text.
                sentence = Sentence("an empty document")
                self.embedding_model.embed(sentence)
            embedding = sentence.embedding.detach().cpu().numpy()
            embeddings.append(embedding)
        embeddings = np.asarray(embeddings)
        return embeddings
class AlbertPre:
    """Preprocessing pipeline that embeds article sentences with an
    ALBERT checkpoint (loaded through flair's BertEmbeddings + pooled) and
    serialises embeddings/highlight ids to ``../data/<name>/``.
    """

    def __init__(self, MAX_WORD_N=150, MAX_SENT_N=30, MAX_WORD_SENT_N=300,
                 alber_model="albert-base-v2") -> None:
        super().__init__()
        # NOTE(review): an ALBERT checkpoint is loaded via BertEmbeddings —
        # confirm the installed flair version supports this.
        albert = BertEmbeddings(bert_model_or_path=alber_model)
        self.albert_embedding = DocumentPoolEmbeddings([albert])
        self.MAX_WORD_N = MAX_WORD_N            # max highlight tokens kept
        self.MAX_SENT_N = MAX_SENT_N            # max sentences per article
        self.MAX_WORD_SENT_N = MAX_WORD_SENT_N  # max chars per sentence fed to the embedder
        self.sentence_piecer = MySentencePiecer()

    def get_embedding(self, sentence):
        """Return the pooled embedding tensor for one sentence string."""
        sent = Sentence(sentence)
        self.albert_embedding.embed(sent)
        return sent.get_embedding()

    @staticmethod
    def split_in_sentences(text):
        """Split raw article text into sentences (syntok's split_single)."""
        return split_single(text)

    @staticmethod
    def load_csv(name):
        """Load ``../data/<name>.tsv`` as a dataframe."""
        return pd.read_csv(str("../data/" + name + ".tsv"), sep="\t")

    def load_data(self):
        """Load the train/test/val TSVs; returns (train, val, test)."""
        train_df = self.load_csv("train")
        test_df = self.load_csv("test")
        val_df = self.load_csv("val")
        return train_df, val_df, test_df

    def embed_sentences(self, sentences):
        """Embed up to MAX_SENT_N sentences into a (MAX_SENT_N, 3072) array;
        rows beyond the available sentences stay zero."""
        arr_embedding = np.zeros((self.MAX_SENT_N, 3072))
        for i, sentence in enumerate(sentences):
            if len(sentence) > 0 and i < self.MAX_SENT_N:
                x = self.get_embedding(sentence[:self.MAX_WORD_SENT_N])
                x = x.to('cpu').detach().numpy()
                arr_embedding[i] = x
        return arr_embedding

    def compute_and_save_df(self, ds, name):
        """Embed every (article, highlight) row of *ds* and write the
        arrays under ``../data/<name>/``."""
        path = "../data/%s" % (name)
        if not os.path.exists(path):
            os.mkdir(path)
        len_ds = len(ds)
        # Memory-mapped output so the full embedding tensor never has to fit
        # in RAM at once.
        article_np = np.memmap(str(path + "/articles.npy"),
                               dtype=np.float32,
                               mode='w+',
                               shape=(len_ds, 30, 3072))
        highlight_list = []
        n_highlight_list = []
        n_article_list = []
        for i, (article, highlight) in ds.iterrows():
            article_sent = self.split_in_sentences(article)
            n_article = len(article_sent)
            article_np[i] = self.embed_sentences(article_sent)

            highlight_ids = np.array(
                self.sentence_piecer.get_ids_from_vocab(highlight))[:self.MAX_WORD_N]
            highlight_list.append(highlight_ids)
            n_highlight_list.append(highlight_ids.shape[0])
            n_article_list.append(n_article)
            if (i % 1000) == 0:
                print("computed [%d/%d]" % (i, len_ds))

        np.save(str(path + "/n_highlights" + ".npy"), n_highlight_list)
        np.save(str(path + "/n_articles" + ".npy"), n_article_list)
        np.save(str(path + "/highlights" + ".npy"), highlight_list)

    @staticmethod
    def load_np_files(name):
        """Load the arrays written by :meth:`compute_and_save_df`."""
        path = "../data/%s" % name
        # Fixed: compute_and_save_df writes "articles.npy" (plural); the
        # original tried to load "article.npy" and always failed.
        # NOTE(review): the file was written via np.memmap (raw, no .npy
        # header) — np.load may reject it; confirm and switch to np.memmap
        # with the matching shape/dtype if so.
        article_np = np.load(str(path + "/articles" + ".npy"),
                             allow_pickle=True)
        n_highlights = np.load(str(path + "/n_highlights" + ".npy"),
                               allow_pickle=True)
        n_articles = np.load(str(path + "/n_articles" + ".npy"),
                             allow_pickle=True)
        highlights = np.load(str(path + "/highlights" + ".npy"),
                             allow_pickle=True)
        return article_np, n_articles, highlights, n_highlights
def training_pipeline_bert(filepath=None,
                           num_words_to_print=10,
                           prefix=None,
                           min_topics=19,
                           max_topics=19,
                           step=2):
    """Embed documents with BERT + Flair, cluster them into topics with
    Ward agglomerative clustering (after PCA), and write the topic
    assignments, per-document centroid distances and the topic distance
    matrix to CSV/flat files.

    :param filepath: input file name inside ``data_dir_local`` (required)
    :param num_words_to_print: currently unused — kept for interface
        compatibility
    :param prefix: currently unused — kept for interface compatibility
    :param min_topics/max_topics/step: topic-range parameters; note the
        clustering below uses a fixed N_CLUSTERS and ignores the range
    """
    logging.info(f'Started training_pipeline : {min_topics}-{max_topics}')
    start = datetime.datetime.now()  # NOTE(review): never logged — keep or report elapsed time
    if filepath is not None:
        filepath = data_dir_local / filepath
    else:
        logging.error("Please enter file name")
        exit()
    if max_topics is None:
        logging.error("Please enter a valid topic number to train model")
        exit()

    col = cols[0]
    df = phraser.load_phrased_data_pipeline(to_load='text',
                                            verbose=True,
                                            overwrite_interim=True,
                                            prefix=None,
                                            training=True,
                                            col='resp_whytfa')

    # Default the topic range to min/max topics.
    num_topics_range = range(min_topics, max_topics + 1, step)
    print('Num_topics_range={}'.format(num_topics_range))

    # Contextual string embeddings capture latent syntactic-semantic
    # information beyond standard word embeddings: they are trained without
    # any explicit notion of words (modelling words as character sequences)
    # and are contextualized by surrounding text, so the same word gets
    # different embeddings depending on its use.
    # (Fixed: this explanation previously continued outside the comment,
    # which was a syntax error.)
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')
    bert_embedding = BertEmbeddings('bert-base-uncased')

    # Combine the word embedding models into one pooled document embedding.
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # 7168 = BERT (3072, four concatenated layers) + 2 x 2048 Flair fwd/bwd.
    X = torch.empty(size=(len(df.index), 7168))  #.cuda()

    i = 0
    for text in df['resp_whytfa']:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        X[i] = sentence.get_embedding()
        i += 1
        # NOTE(review): caps embedding at ~101 documents — looks like debug
        # leftover; remove the break to embed the full corpus.
        if (i > 100):
            break

    print("before the PCA")
    # Detach from the GPU and convert to NumPy before handing to scikit-learn.
    Y = X.cpu().detach().numpy()

    # Ward clustering is O(n^2) in memory, so reduce dimensionality first.
    pca = IncrementalPCA(copy=False, n_components=768, batch_size=1000)
    # Fixed: the original passed the torch tensor X to fit_transform and
    # left the prepared numpy array Y unused.
    X_red = pca.fit_transform(Y)
    del (X)
    print("After the fit_transform")

    N_CLUSTERS = 5

    # Ward agglomerative clustering on the reduced vectors.
    ward = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                   affinity='euclidean',
                                   linkage='ward')
    pred_ward = ward.fit_predict(X_red)
    print("After fit_predict")

    df['topic'] = pred_ward
    df.to_csv('bert_withtopic.csv')
    print("Write bert_withtopic.csv")

    # Topic composition: concatenate the texts of each topic into one doc.
    topic_docs = []
    for topic in range(N_CLUSTERS):
        # Fixed: the assignment column is named 'topic'; the original read a
        # non-existent 'cluster' column here and below.
        topic_docs.append(' '.join(df[df['topic'] == topic]['text_cl'].values))

    df_tfidf = get_top_words(topic_docs, 10)
    # Fixed: the f-string was missing braces and printed the literal text.
    print(f"Top words: {df_tfidf}")

    # Topic centroids: mean reduced vector per topic.
    topic_centroids = []
    for topic in tqdm(range(N_CLUSTERS)):
        X_topic = X_red[df.index[df['topic'] == topic]]
        X_mean = np.mean(X_topic, axis=0)
        topic_centroids.append(X_mean)

    # Euclidean distance of each document vector to its topic centroid;
    # the closer the distance distribution sits to zero, the more compact
    # the topic.
    topic_distances = []
    for row in tqdm(df.index):
        topic_centroid = topic_centroids[df.iloc[row]['topic']]
        X_row = X_red[row]
        topic_distances.append(euclidean(topic_centroid, X_row))
    df['topic_distance'] = topic_distances
    df.to_csv('bert_withtopic_distance.csv')
    print('Write bert_withtopic_distance.csv')

    # Topic similarity: euclidean distance matrix between topic centroids.
    df_dist_matrix = pd.DataFrame(distance_matrix(topic_centroids,
                                                  topic_centroids),
                                  index=range(N_CLUSTERS),
                                  columns=range(N_CLUSTERS))
    print(f"df_dist_matrix={df_dist_matrix}")
    with open('df_dist_matrix', 'w') as fout:
        fout.write(u'#' + '\t'.join(str(e) for e in df_dist_matrix.shape) +
                   '\n')
        # Fixed: DataFrame has no .tofile; write the underlying ndarray,
        # tab-separated to match the header above.
        df_dist_matrix.values.tofile(fout, sep='\t')
class FormFieldSimilarityFinder:
    """
    Generates a vector for each form field from three predefined label
    descriptions, stores them in an ``.npz`` file, and matches user-supplied
    field names against them by embedding similarity.
    """

    def __init__(self):
        # Initialize form fields and their descriptions.
        self.name_field = ['Name of a person','A word by which a person is known',"Identity to call a person"]
        self.age_field = ['Age of a person','Number which tells how old a person is','The length of time a person has lived']
        self.address_field = ['Home Address of a person','A place of residence','Place of stay']
        # Map each field name to its description list.
        self.form_fields = {'Name':self.name_field, 'Age':self.age_field, 'Address':self.address_field}
        # Load all the pretrained embedding models.
        self.elmo_embedding = ELMoEmbeddings()
        self.flair_forward_embedding = FlairEmbeddings('multi-forward')
        self.flair_backward_embedding = FlairEmbeddings('multi-backward')
        self.bert_embedding = BertEmbeddings('bert-base-multilingual-uncased')
        # Stack all the embeddings using DocumentPoolEmbeddings.
        self.stacked_embedding = DocumentPoolEmbeddings(embeddings=[
            self.elmo_embedding, self.flair_forward_embedding,
            self.flair_backward_embedding, self.bert_embedding
        ])
        # Matches scoring at or below this similarity are rejected.
        self.threshold_value = 0.70

    def construct_vector(self, original_sentence):
        """
        Embed *original_sentence* with the stacked embedding and return the
        result as a 1-D numpy array.
        """
        sentence = Sentence(original_sentence)
        self.stacked_embedding.embed(sentence)
        sentence_embedding = sentence.get_embedding()
        return sentence_embedding.detach().numpy()

    def construct_category_vector(self, category_definitions):
        """
        Given a set of category definitions, construct a vector for each
        using the stacked embedding and return the mean of all the vectors.
        """
        category_vectors = []
        for each in category_definitions:
            category_vectors.append(self.construct_vector(each))
        return np.mean(category_vectors, 0)

    def store_category_vectors(self):
        """
        Build a vector for each category and store it in an npz file.
        """
        field_vector_dict = {}
        for field, description_list in self.form_fields.items():
            # Fixed: construct_category_vector is a method — the original
            # called it without ``self.`` and raised NameError.
            field_vector_dict[field] = self.construct_category_vector(description_list)
        np.savez("field_vector.npz", **field_vector_dict)

    @staticmethod
    def find_similarity(vector1, vector2, method="cosine"):
        """
        Return the similarity between two vectors using *method*:
        'cosine' (default, sklearn cosine_similarity) or 'manhattan'
        (sum of absolute element-wise differences).

        Fixed: the original tested ``if "cosine":`` — a truthy string
        literal — so the cosine branch ran regardless of *method*.
        """
        sim_score = 0
        if method == "cosine":
            sim_score = cosine_similarity(vector1, vector2)
        elif method == "manhattan":
            sim_score = sum(abs(val1 - val2)
                            for val1, val2 in zip(vector1, vector2))
        return sim_score

    def find_matching_field(self, user_field):
        """
        Find the closest matching stored field for *user_field*; returns the
        (field, score) pair, or None when nothing clears the threshold.
        """
        field_vectors = np.load('field_vector.npz')
        user_field_vector = self.construct_vector(user_field)
        similarity_dict = {}
        for field, vector in field_vectors.items():
            # Fixed: find_similarity must be resolved through self/the class —
            # the bare call raised NameError.
            similarity_dict[field] = self.find_similarity(
                vector.reshape(1, -1), user_field_vector.reshape(1, -1))
        similarity_dict = {key: value for key, value in similarity_dict.items()
                           if value > self.threshold_value}
        if similarity_dict:
            max_pair = max(similarity_dict.items(), key=operator.itemgetter(1))
            # cosine_similarity returns a (1, 1) array — unwrap it.
            confidence = float("{0:.2f}".format(max_pair[1][0][0])) * 100
            print(f"Closest Match to the field is '{max_pair[0]}' with confidence: {confidence}%")
            return max_pair
        else:
            print("No Confident Match is found!!!")
            return None