def embedize(self, data_subset_list):
    """Embed a tweet (string or pre-tokenized list) and return a NumPy array."""
    tweet = Sentence(data_subset_list)
    # NOTE: this builds a fresh embedding model on every call, which is slow;
    # consider caching the model on self instead.
    embedding = TransformerDocumentEmbeddings(self.embedding)
    embedding.embed(tweet)
    tweet_emb = tweet.get_embedding()
    # detach and move to CPU so this also works when the model runs on a GPU
    tweet_emb_np = tweet_emb.detach().cpu().numpy()
    return tweet_emb_np
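
# Hypothetical usage sketch for embedize above. It assumes a host class whose
# `self.embedding` attribute holds a Hugging Face model name; `TweetEmbedder`
# and the example input are illustrative only.
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings


class TweetEmbedder:
    embedize = embedize  # reuse the module-level function as a method

    def __init__(self, model_name: str = "gpt2-medium"):
        self.embedding = model_name


vec = TweetEmbedder().embedize(["just", "landed", "in", "berlin"])
print(vec.shape)  # hidden size of the model, e.g. (1024,) for gpt2-medium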
from typing import Optional

import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings


class Embedding:
    """Performs embedding on sentences."""

    def __init__(self, model: str = 'gpt2-medium'):
        """
        Initializes the embedding model.

        :param {str} model - The model architecture. Must be one of
            https://huggingface.co/transformers/pretrained_models.html
        """
        self.model = TransformerDocumentEmbeddings(model, batch_size=8)

    def embed(self, sentence: str) -> Optional[np.ndarray]:
        """
        Embeds a given sentence. If it fails, returns None.

        :param {str} sentence - A cased or uncased sentence.
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('ascii')
        if isinstance(sentence, list):
            sentence = ' '.join(sentence)
        if sentence == '':
            return None
        try:
            sent = Sentence(sentence)
            self.model.embed(sent)
            # detach and move to CPU so the result is a plain NumPy array
            return sent.embedding.detach().cpu().numpy()
        except TypeError:
            return None
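
# Minimal usage sketch of the Embedding class above; the input text is
# illustrative, and the vector length depends on the chosen model.
embedder = Embedding(model='gpt2-medium')
vec = embedder.embed("Flair makes document embeddings easy.")
if vec is not None:
    print(vec.shape)  # (1024,) for gpt2-medium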
import time

import numpy as np
import pandas as pd


def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.data import Sentence
    from flair.embeddings import (DocumentPoolEmbeddings, FlairEmbeddings,
                                  TransformerDocumentEmbeddings,
                                  WordEmbeddings)

    # German word embeddings plus forward/backward contextual string
    # embeddings, mean-pooled into a single document vector
    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')
    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding,
        flair_embedding_forward,
        flair_embedding_backward,
    ])
    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()
    embeddings = []
    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i + 1, len(data)))
        sentence = Sentence(text)
        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)
        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)
    columns = ["embedding_{}".format(feature)
               for feature in range(embeddings.shape[1])]
    csv = pd.DataFrame(embeddings, columns=columns)
    # path must be provided; the CSV is written to <path><embeddings_type>_<typs>.csv
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)

    toc = time.time()
    print("[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {:.1f}s"
          .format(embeddings_type, typs, toc - tic))
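
# Hypothetical invocation of create_embeddings_flair; the DataFrame contents
# and output directory are illustrative.
if __name__ == "__main__":
    df = pd.DataFrame({"text": ["Das ist ein Test.", "Noch ein Satz."]})
    create_embeddings_flair(df, column="text", path="./",
                            embeddings_type="transformer", typs="train")
    # writes ./transformer_train.csv with one embedding_<i> column per dimension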
def test_transformer_document_embeddings():
    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased')

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)
    # default: last layer only -> hidden size 768
    assert len(sentence.get_embedding()) == 768

    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    # 'all' concatenates the embedding layer plus all 6 transformer layers
    # of distilbert: 7 * 768 = 5376
    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                               layers='all')
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 5376

    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    # layer_mean averages the selected layers back down to 768 dimensions
    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                               layers='all',
                                               layer_mean=True)
    embeddings.embed(sentence)
    assert len(sentence.get_embedding()) == 768

    sentence.clear_embeddings()
    del embeddings
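
# The test above can be run standalone with pytest (file name illustrative):
#   pytest -q test_document_embeddings.py -k transformer_document_embeddings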
def vectorize(self, X):
    # init embedding model
    print(f"Load {self.model_name} model ...")
    model = TransformerDocumentEmbeddings(self.model_name, fine_tune=False)

    # convert to Sentence objects
    print("Convert to Sentence objects ...")
    X = X.str.lower()
    sentences = X.progress_apply(lambda x: Sentence(x))

    # get vectors from BERT: embed() mutates the Sentence objects in place,
    # so the first apply is run for its side effect only
    print(f"Get {self.model_name} embeddings ...")
    sentences.progress_apply(lambda x: model.embed(x))
    docvecs = sentences.progress_apply(
        lambda x: x.embedding.detach().cpu().numpy())
    return list(docvecs)
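
# Hypothetical host class for vectorize above. `progress_apply` requires
# tqdm's pandas integration; the class name, attribute, and inputs are
# illustrative only.
import pandas as pd
from tqdm import tqdm
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings

tqdm.pandas()


class Vectorizer:
    vectorize = vectorize  # reuse the module-level function as a method

    def __init__(self, model_name: str = "bert-base-uncased"):
        self.model_name = model_name


docvecs = Vectorizer().vectorize(pd.Series(["I love Berlin.", "Flair is handy."]))
print(len(docvecs), docvecs[0].shape)  # 2 vectors of the model's hidden size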
import sys

import numpy as np
import pandas as pd
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings

text_col = "review_text"
id_col = "row_id"

file_name = sys.argv[1]
batch_size = int(sys.argv[2])

df = pd.read_csv(file_name)
embedding = TransformerDocumentEmbeddings('bert-base-uncased')

outs = list()
# split the dataframe into consecutive batches of batch_size rows
df['batch'] = np.arange(len(df)) // batch_size
for b in df['batch'].unique():
    print(b)
    current_batch = df[df['batch'] == b]
    # embed() returns the list of sentences it was given, embedded in place;
    # note that rows are still embedded one Sentence at a time here, so the
    # batch column only chunks the dataframe, not the model calls
    out = current_batch[text_col].apply(lambda k: pd.Series(
        embedding.embed(Sentence(k))[0].embedding.tolist()))
    out = pd.concat([current_batch[id_col], out], axis=1)
    outs.append(out)

outs = pd.concat(outs)
outs.columns = [id_col] + ['emb_' + str(c) for c in outs.columns[1:]]
outs.to_csv("embeddings.csv")
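
# Example invocation (script and file names illustrative); the input CSV must
# contain `row_id` and `review_text` columns:
#   python embed_reviews.py reviews.csv 32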
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch
from flair.data import Sentence
from flair.embeddings import (SentenceTransformerDocumentEmbeddings,
                              TransformerDocumentEmbeddings)
from geopy.distance import geodesic
from geopy.exc import GeocoderUnavailable
from geopy.geocoders import Nominatim
from pydantic import ValidationError
from tqdm import tqdm
from urllib3.exceptions import MaxRetryError

# Project-local helpers assumed to be defined elsewhere in this repo:
# Company (pydantic model), load_json, load_pickle, save_pickle,
# cos_similarity, and display_top_malloc_lines.

logger = logging.getLogger(__name__)


class CompanyMatcher:
    companies: Dict[str, Company] = {}
    embedding: Union[TransformerDocumentEmbeddings,
                     SentenceTransformerDocumentEmbeddings]
    description_embeddings: Dict[str, torch.Tensor] = {}
    embeddings_pkl_file_path: Path = Path("description-embeddings.pkl")
    geolocator: Nominatim = Nominatim(user_agent="company-matcher")
    headquarters_locations: Dict[str, Tuple[float, float]] = {}
    locations_pkl_file_path: Path = Path("headquarters-locations.pkl")
    _similarity_component_weights: Dict[str, float] = {
        "description": 0.6,
        "founded": 0.2,
        "headquarters": 0.2,
    }
    _founding_year_normalizer: Optional[int] = None
    # Roughly half of Earth's circumference in km, i.e. the maximum geodesic
    # distance between two points. The true largest distance between two
    # headquarters takes too long to compute (at least with the inefficient
    # method sketched in _calc_hq_location_normalizer below).
    _hq_location_normalizer: int = 20000

    def __init__(self,
                 company_file_path: Path,
                 transformer_model: str,
                 sentence_transformer: bool = False,
                 similarity_component_weights: Dict[str, float] = None,
                 load_n: int = None):
        self._load_transformer_model(transformer_model, sentence_transformer)
        self._load_companies(company_file_path, load_n)
        self._embed_descriptions()
        self._locate_headquarters()
        self._load_similarity_component_weights(similarity_component_weights)
        self._calc_founding_year_normalizer()

    def _load_similarity_component_weights(
            self, similarity_component_weights: Dict[str, float]):
        if similarity_component_weights:
            # validate the *given* weights, not the defaults
            if {"description", "founded", "headquarters"} == set(
                    similarity_component_weights.keys()):
                self._similarity_component_weights = similarity_component_weights
            else:
                logger.warning(
                    f"Invalid similarity component weights given: "
                    f"{similarity_component_weights}")
                logger.warning("Using default values!")

    def _load_transformer_model(self, transformer_model: str,
                                sentence_transformer: bool):
        logger.info("Loading transformer model...")
        if sentence_transformer:
            try:
                self.embedding = SentenceTransformerDocumentEmbeddings(
                    transformer_model)
            except OSError as e:
                logger.error("Could not load sentence transformer model: "
                             + str(e))
                exit()
        else:
            try:
                self.embedding = TransformerDocumentEmbeddings(
                    transformer_model, fine_tune=False)
            except OSError as e:
                logger.error("Could not load transformer model: " + str(e))
                exit()
        logger.info("Done loading transformer model!")

    def _load_companies(self, company_file_path: Path, load_n: int):
        logger.info("Loading company data from file...")
        try:
            json_data = load_json(company_file_path)
        except OSError as e:
            logger.error("Could not load company data file: " + str(e))
            exit()
        # guard against load_n being None before comparing
        if load_n and 0 < load_n <= len(json_data):
            json_data = json_data[:load_n]
        try:
            companies_list = [Company(**entry) for entry in json_data]
        except ValidationError as e:
            logger.error("Company data does not follow valid format: "
                         + str(e))
            exit()
        companies_url_list = [c.url for c in companies_list]
        if len(companies_url_list) != len(set(companies_url_list)):
            logger.warning("Company URLs are not unique!")
        # build the URL -> Company map and check which companies are duplicates
        duplicate_company_urls = []
        for company in companies_list:
            if self.companies.get(company.url, None):
                duplicate_company_urls.append(company.url)
            else:
                self.companies[company.url] = company
        if duplicate_company_urls:
            logger.warning(
                f"Following company URLs have multiple entries: "
                f"{duplicate_company_urls}")
            logger.warning("Duplicate entries will be ignored!")
        logger.info("Done loading company data!")

    def _embed_descriptions(self,
                            chunk_size: int = 30,
                            load_from_pickle: bool = True,
                            save_to_pickle: bool = True):
        if load_from_pickle:
            self.description_embeddings = load_pickle(
                self.embeddings_pkl_file_path,
                error_msg="Could not load stored embeddings!")
        descriptions_ = [(company.url, Sentence(company.description))
                         for company in self.companies.values()
                         if company.url not in self.description_embeddings]
        # chunking for progress bar
        if descriptions_:
            logger.info("Computing description embeddings...")
            with tqdm(total=len(descriptions_)) as pbar:
                for start_idx in range(0, len(descriptions_), chunk_size):
                    end_idx = min(start_idx + chunk_size, len(descriptions_))
                    # use a separate name so the range step above stays intact
                    current_chunk_size = end_idx - start_idx
                    descriptions_chunk = descriptions_[start_idx:end_idx]
                    self.embedding.embed([
                        description_[1] for description_ in descriptions_chunk
                    ])
                    self.description_embeddings.update({
                        description_[0]: description_[1].embedding
                        for description_ in descriptions_chunk
                    })
                    # remove embeddings from the Sentence objects to free memory
                    for _, description_sentence in descriptions_chunk:
                        description_sentence.clear_embeddings()
                    if save_to_pickle:
                        save_pickle(
                            object_=self.description_embeddings,
                            pkl_file_path=self.embeddings_pkl_file_path,
                            error_msg="Could not save new embeddings!")
                    pbar.update(current_chunk_size)
                    # DEBUGGING
                    # snapshot = tracemalloc.take_snapshot()
                    # display_top_malloc_lines(snapshot)
            logger.info("Done computing description embeddings!")

    def _locate_headquarters(self,
                             load_from_pickle: bool = True,
                             save_to_pickle: bool = True):
        if load_from_pickle:
            self.headquarters_locations = load_pickle(
                self.locations_pkl_file_path,
                error_msg="Could not load stored locations!")
        not_located_companies = [
            company_url for company_url in self.companies
            if company_url not in self.headquarters_locations
        ]
        if not_located_companies:
            logger.info("Geo-locating company headquarters...")
            for company_url in not_located_companies:
                company = self.companies[company_url]
                if company.headquarters and not self.headquarters_locations.get(
                        company_url, None):
                    location = None
                    try:
                        location = self.geolocator.geocode(
                            company.headquarters)
                    except (MaxRetryError, GeocoderUnavailable):
                        pass
                    if location:
                        self.headquarters_locations[company_url] = (
                            location.latitude, location.longitude)
                    if save_to_pickle:
                        save_pickle(
                            object_=self.headquarters_locations,
                            pkl_file_path=self.locations_pkl_file_path,
                            error_msg="Could not save locations!")
            logger.info("Done locating company headquarters!")

    def _calc_founding_year_normalizer(self):
        company_founding_years = [
            company.founded for company in self.companies.values()
            if company.founded
        ]
        min_founding_year = min(company_founding_years)
        max_founding_year = max(company_founding_years)
        self._founding_year_normalizer = max_founding_year - min_founding_year

    # This function is too inefficient for datasets with more than a few
    # hundred entries:
    # def _calc_hq_location_normalizer(self):
    #     location_pairs = combinations(self.headquarters_locations.values(), 2)
    #     location_distances = [geodesic(pair[0], pair[1]).kilometers
    #                           for pair in location_pairs]
    #     self._hq_location_normalizer = max(location_distances)

    # ---------------------------- #
    # --- Similarity functions --- #

    def _description_similarities(self, query_company: Company):
        query_embedding = self.description_embeddings[query_company.url]
        return {
            candidate_url: float(cos_similarity(query_embedding,
                                                candidate_embedding))
            for candidate_url, candidate_embedding in
            self.description_embeddings.items()
            if candidate_url != query_company.url
        }

    def _founded_similarities(self, query_company: Company):
        return {
            company.url:
            self._calc_founded_similarity(query_company, company)
            for company in self.companies.values()
            if company.url != query_company.url
        }

    def _calc_founded_similarity(
            self, query_company: Company,
            candidate_company: Company) -> Union[float, None]:
        if query_company.founded and candidate_company.founded:
            return 1 - abs(query_company.founded - candidate_company.founded) \
                / float(self._founding_year_normalizer)
        else:
            return None

    def _headquarters_similarities(self, query_company: Company):
        return {
            company.url:
            self._calc_headquarters_similarity(query_company, company)
            for company in self.companies.values()
            if company.url != query_company.url
        }

    def _calc_headquarters_similarity(
            self, query_company: Company,
            candidate_company: Company) -> Union[float, None]:
        query_location = self.headquarters_locations.get(
            query_company.url, None)
        candidate_location = self.headquarters_locations.get(
            candidate_company.url, None)
        if query_location and candidate_location:
            return 1 - geodesic(query_location,
                                candidate_location).kilometers \
                / float(self._hq_location_normalizer)
        else:
            return None

    def get_peers(self, query_url: str,
                  top_k: int = 10) -> Union[List[str], None]:
        if query_url not in self.companies:
            logger.warning(f"Company with URL '{query_url}' does not exist!")
            return None
        query_company = self.companies[query_url]
        description_similarities = self._description_similarities(
            query_company)
        founded_similarities = self._founded_similarities(query_company)
        headquarters_similarities = self._headquarters_similarities(
            query_company)

        weights = self._similarity_component_weights
        similarities = []
        for company_url, description_similarity in \
                description_similarities.items():
            founded_similarity = founded_similarities[company_url]
            headquarters_similarity = headquarters_similarities[company_url]
            # compare against None explicitly so a legitimate similarity of
            # 0.0 is not mistaken for a missing value
            if founded_similarity is not None \
                    and headquarters_similarity is not None:
                similarity = (
                    weights['description'] * description_similarity
                    + weights['founded'] * founded_similarity
                    + weights['headquarters'] * headquarters_similarity)
            elif founded_similarity is not None:
                # renormalize over the two available components
                weight_normalizer = weights['description'] + weights['founded']
                similarity = (
                    weights['description'] * description_similarity
                    + weights['founded'] * founded_similarity
                ) / float(weight_normalizer)
            elif headquarters_similarity is not None:
                weight_normalizer = (weights['description']
                                     + weights['headquarters'])
                similarity = (
                    weights['description'] * description_similarity
                    + weights['headquarters'] * headquarters_similarity
                ) / float(weight_normalizer)
            else:
                # only the description component is available
                similarity = description_similarity
            similarities.append((company_url, similarity))

        sorted_similarities = sorted(similarities,
                                     key=lambda x: x[1],
                                     reverse=True)
        return [x[0] for x in sorted_similarities][:top_k]
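
# Hypothetical usage sketch of CompanyMatcher; the data file, model name, and
# query URL are illustrative, and companies.json must match the Company model.
matcher = CompanyMatcher(
    company_file_path=Path("companies.json"),
    transformer_model="bert-base-uncased",
    similarity_component_weights={
        "description": 0.5,
        "founded": 0.25,
        "headquarters": 0.25,
    },
)
peers = matcher.get_peers("https://example-company.com", top_k=5)
print(peers)  # the five most similar company URLs, or None if the URL is unknown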