def __init__(self, verbose=1):
    """Load the LASER models, the labeled reviews CSV and the stopword
    list, downloading/generating any missing resource on the fly.

    :param verbose: when > 0, progress/warning messages are printed.
    """
    # Load laserembeddings models; any failure here is taken to mean the
    # model files are missing, so download them and retry once.
    # (Original used a bare `except:`, which also swallowed KeyboardInterrupt.)
    try:
        self.laser = Laser()
    except Exception:
        if verbose > 0:
            print("WARNING laserembeddings models missing, downloading ...")
        os.system("python -m laserembeddings download-models")
        self.laser = Laser()
    # Load the reviews csv; if missing, generate it.
    # It is produced by "generate_csv_from_reviews.py", based on the reviews
    # in "sorted_data".
    if not os.path.isfile("labeled_reviews.csv"):
        if verbose > 0:
            print("WARNING csv missing, generating ...")
        start_timer = time.time()
        generate_csv_from_reviews.generate_csv_from_reviews("labeled_reviews.csv")
        if verbose > 0:
            print("time to generate:", round(time.time() - start_timer, 2), "s")
    # Load stopwords. `with` closes the handle (the original leaked it).
    # The file ends with a newline, hence the trailing empty entry popped.
    with open("sorted_data/stopwords", "r") as f:
        self.stopwords = f.read().split("\n")
    self.stopwords.pop(-1)
    self.df_reviews = pd.read_csv("labeled_reviews.csv")
    # Initialise model as False so we know it isn't loaded yet.
    self.model = False
    if verbose > 0:
        print("SentimentAnalyse ready to use.")
def __init__(self):
    """Private constructor: intended to be reached only via get_instance()."""
    if Singletons.__instance is None:
        logger.info("Initializing Laser embedder")
        self.laser_embedder = Laser()
        Singletons.__instance = self
    else:
        raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")
def post_init(self):
    """Build the Laser model from the configured file paths and move its
    encoder onto the target device."""
    from laserembeddings import Laser
    laser_model = Laser(
        bpe_codes=self._path_to_bpe_codes,
        bpe_vocab=self._path_to_bpe_vocab,
        encoder=self._path_to_encoder,
    )
    self.model = laser_model
    self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)
def getSentenceVector(doc,
                      model_params: dict = None,
                      encoder="distilbert",
                      model_name='distilbert-base-nli-mean-tokens'):
    """Split *doc* into sentences and embed each one with the chosen encoder.

    :param doc: input text to be sentence-split (spaCy en_core_web_sm).
    :param model_params: optional dict of extra model options; only
        'tokenizer_args' is read here (transformer encoders).
    :param encoder: one of the transformer family names, 'use', 'infersent',
        'sent2vec' or 'laser'.
    :param model_name: HF model name used for the transformer encoders.
    :return: list of (sentence, embedding) pairs.
    :raises ValueError: for an unknown encoder alias.
    """
    # BUG FIX: the original used `model_params: dict = {}` — a mutable
    # default shared across calls. Use the None-sentinel idiom instead.
    if model_params is None:
        model_params = {}
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = [sent.text for sent in tokenized.sents]
    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
        # Use encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params.get('tokenizer_args', {}))
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        #!pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # Also used for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        # Reconstructed from the garbled original literal (it was split
        # mid-string by the formatting).
        raise ValueError('Invalid encoder {} or encoder Unavailable.'.format(encoder))
    return list(zip(sentences, sentence_embeddings))
def __init__(self, mode="train"):
    """Initialise an empty extractor for the given data split; the data
    attributes are populated later by the loading/processing steps."""
    self.mode = mode
    self.laser = Laser()
    # Filled in by subsequent pipeline stages.
    self.src = None
    self.tgt = None
    self.scores = None
    self.df = None
def test_laser():
    """Smoke test: a Laser built with a file object for the encoder still
    yields (n_sentences, 1024) embeddings."""
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(Laser.DEFAULT_BPE_CODES_FILE, None, f_encoder)
        result = laser.embed_sentences(
            ['hello world!', 'i hope the tests are passing'], lang='en')
        assert result.shape == (2, 1024)
def __init__(self):
    """Private constructor: use get_instance() instead of calling this."""
    if Singletons.__instance is None:
        logger.info("Initializing Laser embedder")
        self.laser_embedder = Laser()
        # Per-instance caches for previously computed dimensions.
        self.cached_lq_dims = {}
        self.cached_intro_dims = {}
        # logger.info("Initializing Roberta embedder")
        # self.robert_embedder = SentenceTransformer(constants.fetch_constant("robeta_path"))
        Singletons.__instance = self
    else:
        raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")
def transform_sentences(_sent_map):
    """
    Builds sentence embeddings using the LASER model.

    :param _sent_map: mapping whose keys are the sentences to embed.
    :return: Torch matrix of embeddings, size 1024.
    """
    embedder = Laser()
    raw_embeddings = embedder.embed_sentences(list(_sent_map.keys()), lang='en')
    return torch.stack([torch.from_numpy(row) for row in raw_embeddings])
def post_init(self):
    """
    creates Laser object to be used to create the embedding during encode
    """
    try:
        self.laser = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
    except Exception as exp:
        # Log instead of crashing; encode() will fail later if laser is unset.
        self.logger.error(
            f'Got the following exception while instantiating Laser model {exp}'
        )
def laser_classifier(x_train, y_train, x_test, y_test):
    """Embed Arabic texts with LASER, fit an SVC, and print test accuracy."""
    embedder = Laser()

    def _vectorize(texts):
        # One embed_sentences call per text, flattened to a 1-D vector.
        return [np.concatenate(embedder.embed_sentences([t], lang='ar'))
                for t in texts]

    train_vectors = _vectorize(x_train)
    test_vectors = _vectorize(x_test)
    classifier = SVC(random_state=0).fit(train_vectors, y_train)
    preds = classifier.predict(test_vectors)
    print(f'Accuracy score: {accuracy_score(preds, y_test).round(2)}')
class Singletons:
    """Process-wide holder for a single Laser embedder instance."""

    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization ")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        # Guard: the constructor must only run once, via get_instance().
        if Singletons.__instance is None:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self
        else:
            raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
        """
        if self.laser_embedder is None:
            logger.info("the embedder is not set please restart the service")
            return None
        return self.laser_embedder.embed_sentences(
            all_sentences, ["en"] * len(all_sentences))
def __init__(self, method: str = 'muse', path_to_model: str = './models/muse/'):
    """Set up the underlying encoder for the requested method alias."""
    assert method in self.__valid_methods, \
        f'Expected method aliases: {self.__valid_methods}'
    self.method = method
    # 'muse' and 'use' are both tf-hub modules loaded the same way.
    if self.method in ('muse', 'use'):
        self.__vectorizer = hub.load(path_to_model)
    elif self.method == 'laser':
        self.__vectorizer = Laser()
    else:
        self.__vectorizer = None
def test_similarity(test_data):
    # Full-corpus check of this package's embeddings against embeddings
    # computed with the original LASER implementation; writes a per-language
    # cosine-similarity report in Markdown.
    if not SIMILARITY_TEST:
        pytest.skip("SIMILARITY_TEST not set")
    if not test_data:
        raise FileNotFoundError(
            'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that 🔧'
        )
    # The report lands next to this test file, under report/.
    report = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'report', 'comparison-with-LASER.md')
    laser = Laser()
    with open(report, 'w', encoding='utf-8') as f_report:
        f_report.write(
            '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n'
        )
        f_report.write(
            '| |language|avg. cosine similarity|min. cosine similarity|\n')
        f_report.write(
            '|-|--------|----------------------|----------------------|\n')
        for lang in test_data['langs']:
            if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
                # language not supported, ignoring
                continue
            sents = test_data[f'{lang}_sentences']
            orig_embeddings = test_data[f'{lang}_embeddings']
            embeddings = laser.embed_sentences(sents, lang)
            assert embeddings.shape == orig_embeddings.shape
            # Row-wise cosine similarity between reference and recomputed
            # embedding matrices.
            cosine_similarities = np.sum(
                orig_embeddings * embeddings,
                axis=1) / (np.linalg.norm(orig_embeddings, axis=1) *
                           np.linalg.norm(embeddings, axis=1))
            similarity_mean = np.mean(cosine_similarities)
            similarity_min = np.min(cosine_similarities)
            # ✅ near-exact everywhere, ⚠️ close on average, ❌ otherwise.
            f_report.write(
                f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n'
            )
class LaserEncoder(BaseTextEncoder):
    """
    Encoder for language-agnostic sentence representations (Laser) from
    Facebook research (https://github.com/facebookresearch/LASER)

    :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
    :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
    :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
    :param language: language to be passed while creating the embedding. Defaults to en.
    """

    def __init__(self,
                 path_to_bpe_codes: str = Laser.DEFAULT_BPE_CODES_FILE,
                 path_to_bpe_vocab: str = Laser.DEFAULT_BPE_VOCAB_FILE,
                 path_to_encoder: str = Laser.DEFAULT_ENCODER_FILE,
                 language: str = 'en',
                 *args,
                 **kwargs):
        # BUG FIX: the original tested `not Path(path)`, but a Path object is
        # always truthy, so the missing-file branch could never fire — and had
        # it fired, the corresponding attribute would never have been set,
        # breaking post_init with an AttributeError. Check existence properly
        # and always assign the attribute (matching the original's effective
        # behavior of always assigning).
        if not Path(path_to_bpe_codes).exists():
            self.logger.error(f'bpe code file {path_to_bpe_codes} not found')
        self._path_to_bpe_codes = path_to_bpe_codes
        if not Path(path_to_bpe_vocab).exists():
            self.logger.error(f'bpe vocab file {path_to_bpe_vocab} not found')
        self._path_to_bpe_vocab = path_to_bpe_vocab
        if not Path(path_to_encoder).exists():
            # Original used `self._logger` here, inconsistent with the other
            # two branches; unified on `self.logger`.
            self.logger.error(f'encode file {path_to_encoder} not found')
        self._path_to_encoder = path_to_encoder
        self.language = language
        super().__init__(*args, **kwargs)

    def post_init(self):
        """
        creates Laser object to be used to create the embedding during encode
        """
        try:
            self.laser = Laser(bpe_codes=self._path_to_bpe_codes,
                               bpe_vocab=self._path_to_bpe_vocab,
                               encoder=self._path_to_encoder)
        except Exception as exp:
            self.logger.error(
                f'Got the following exception while instantiating Laser model {exp}'
            )

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D` (D=1024)
        """
        return self.laser.embed_sentences(sentences=data, lang=self.language)
class LaserEncoder(BaseTorchEncoder):
    """
    Encode an array of string in size `B` into an ndarray in size `B x D`.
    The ndarray potentially is BatchSize x (Channel x Height x Width).

    :class:`LaserEncoder` is a encoder based on Facebook Research's LASER
    (Language-Agnostic SEntence Representations) to compute multilingual
    sentence embeddings: https://github.com/facebookresearch/LASER

    :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
    :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
    :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
    :param language: language of the text. Defaults to english(en).
    :param args: Additional positional arguments
    :param kwargs: Additional keyword arguments
    """

    def __init__(self, path_to_bpe_codes: str = None,
                 path_to_bpe_vocab: str = None, path_to_encoder: str = None,
                 language: str = 'en', *args, **kwargs):
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        # Fall back to the package-bundled default files when no explicit
        # path is supplied.
        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        """Load LaserEncoder model"""
        from laserembeddings import Laser
        self.model = Laser(bpe_codes=self._path_to_bpe_codes,
                           bpe_vocab=self._path_to_bpe_vocab,
                           encoder=self._path_to_encoder)
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        Encode data into an ndarray in size `B x D`.
        B is the `Batch size` and `D` the dimension.

        :param data: a 1d array of string type in size `B`
        :param args: Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: an ndarray in size `B x D`.
        """
        return self.model.embed_sentences(data, lang=self.language)
class LASEREmbedder(Embedder):
    """Embedder backed by a LASER model."""

    def __init__(self, tokenizer_language):
        super().__init__()
        self.tokenizer_language = tokenizer_language
        self.laser = Laser()

    def embed(self, sentence):
        # embed_sentences returns a 2-D array; unwrap the first row.
        vectors = self.laser.embed_sentences(sentence, self.tokenizer_language)
        return vectors[0]
def get_vectors(strings):
    """Detect each string's language, normalise the text, and return LASER
    embeddings for the whole batch."""
    # One detected language code per input string (classify returns
    # (lang, score); we keep the code).
    languages = [classify(text)[0] for text in strings]
    # Lowercase, collapse newlines into spaces, strip non-word characters.
    normalised = [text.lower() for text in strings]
    normalised = [" ".join(text.splitlines()) for text in normalised]
    normalised = [re.sub(r'\W+', ' ', text) for text in normalised]
    return Laser().embed_sentences(normalised, lang=languages)
class AppConfig(AppConfig):
    # NOTE(review): the class deliberately shadows the imported AppConfig
    # base-class name; all resources below are loaded once, at
    # class-definition (app-load) time.
    name = "semantic_similarity"
    # Multilingual LASER sentence embedder.
    laser = Laser()
    # TF-Hub module (USE_MODULE_URL) used as the second encoder.
    model = hub.load(settings.USE_MODULE_URL)
    # Data set backing the semantic-similarity app.
    df = pd.read_csv(
        os.path.join(settings.ROOT_DIR, settings.SEMANTIC_SIMILARITY_DATA_FN))
    # Precomputed arrays; allow_pickle=True suggests they contain Python
    # objects rather than plain numeric arrays — verify before changing.
    BASE_VECTORS_LOADED = np.load(os.path.join(settings.ROOT_DIR,
                                               settings.BASE_VECTORS_FN),
                                  allow_pickle=True)
    PROCESSED_DATA_LOADED = np.load(os.path.join(settings.ROOT_DIR,
                                                 settings.PROCESSED_DATA_FN),
                                    allow_pickle=True)
class LaserVectorizer(TransformerMixin, BaseEstimator):
    """scikit-learn transformer that maps sentences to LASER embeddings."""

    def __init__(self):
        # Paths come from module-level constants.
        self.model = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)
        print('Applying Laser Transform')

    def fit(self, X, y=None):
        """No-op fit.

        BUG FIX: accepts `y` (ignored) — sklearn pipelines call fit(X, y),
        which crashed with the original one-argument signature. Passing no
        `y` still behaves exactly as before.
        """
        return self

    def transform(self, X):
        """Embed the sentences in X with English tokenization; returns the
        array produced by Laser.embed_sentences."""
        x_laser = self.model.embed_sentences(X, lang='en')
        return x_laser
class LaserEncoder(BaseTorchEncoder):
    """
    :class:`LaserEncoder` computes multilingual sentence embeddings with
    Facebook Research's LASER (Language-Agnostic SEntence Representations),
    encoding a 1d array of `B` strings into a `B x D` ndarray.
    https://github.com/facebookresearch/LASER
    """

    def __init__(
        self,
        path_to_bpe_codes: str = None,
        path_to_bpe_vocab: str = None,
        path_to_encoder: str = None,
        language: str = 'en',
        *args,
        **kwargs,
    ):
        """
        :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language of the text. Defaults to en.
        :param args:
        :param kwargs:
        """
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        # Missing paths fall back to the files bundled with laserembeddings.
        if path_to_bpe_codes is None:
            path_to_bpe_codes = Laser.DEFAULT_BPE_CODES_FILE
        if path_to_bpe_vocab is None:
            path_to_bpe_vocab = Laser.DEFAULT_BPE_VOCAB_FILE
        if path_to_encoder is None:
            path_to_encoder = Laser.DEFAULT_ENCODER_FILE
        self._path_to_bpe_codes = path_to_bpe_codes
        self._path_to_bpe_vocab = path_to_bpe_vocab
        self._path_to_encoder = path_to_encoder
        self.language = language.lower()

    def post_init(self):
        """Instantiate the Laser model and move its encoder to the device."""
        from laserembeddings import Laser
        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D`
        """
        return self.model.embed_sentences(data, lang=self.language)
class Vectorizer(object):
    """
    Encoding/Vectorization of text wrapper for various models.

    @:param method: str, optional (default: 'muse'); alias of the
        encoding/vectorization method to use
        - 'use' - Universal Sentence Encoder
          (https://tfhub.dev/google/universal-sentence-encoder/4)
        - 'muse' - Multilingual Universal Sentence Encoder
          (https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)
        - 'laser' - Language-Agnostic SEntence Representations
          (https://github.com/facebookresearch/LASER)
    @:param path_to_model: str, optional (default: './models/muse/');
        path to models (not needed for LASER; in case of tf-hub models, the
        parameter may either contain a link or the path to a locally saved
        model)
    """

    __valid_methods = ['muse', 'laser', 'use']

    def __init__(self, method: str = 'muse', path_to_model: str = './models/muse/'):
        assert method in self.__valid_methods, \
            f'Expected method aliases: {self.__valid_methods}'
        self.method = method
        # Both USE variants are tf-hub modules and load the same way.
        if self.method in ('muse', 'use'):
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'laser':
            self.__vectorizer = Laser()
        else:
            self.__vectorizer = None

    def vectorize(self, docs: List[str], **kwargs) -> List[List[float]]:
        """Encode `docs` into lists of floats with the configured backend."""
        if self.method == 'laser':
            return self.__vectorizer.embed_sentences(docs, **kwargs).tolist()
        if self.method in {'muse', 'use'}:
            return self.__vectorizer(docs).numpy().tolist()
        raise ValueError(f'Method {self.method} is not available')
def encode_documents_laser(documents, params, tokenizer=None):
    """Embed pre-tokenized documents with LASER into a fixed-size tensor.

    :param documents: iterable of documents, each a list of sentence strings.
    :param params: dict providing 'max_sentences_per_doc' ('max_length' was
        read but never used by the original; it is dropped here).
    :param tokenizer: unused; kept for interface compatibility.
    :return: FloatTensor of shape (n_docs, max_sentences_per_doc, 3, 1024);
        only index 0 of the third axis is filled.
    """
    laser = Laser()
    output = torch.zeros(
        size=(len(documents), params['max_sentences_per_doc'], 3, 1024),
        dtype=torch.float)
    for doc_index, tokenized_document in tqdm(enumerate(documents)):
        lang_list = []
        for sentence in tokenized_document:
            # detect() can raise on undetectable strings; default to 'en'.
            # (Original used a bare `except:`; narrowed to Exception.)
            try:
                lang_list.append(detect(sentence))
            except Exception:
                lang_list.append('en')
        embeddings = laser.embed_sentences(
            tokenized_document, lang=lang_list)  # lang is only used for tokenization
        for seq_index, embed in enumerate(embeddings):
            if seq_index >= params['max_sentences_per_doc']:
                # Every later index also overflows, so stop instead of
                # continuing through the rest of the document.
                break
            output[doc_index][seq_index][0] = torch.FloatTensor(embed)
    return output
def test_laser():
    """Exercise the main embed_sentences call patterns and error handling."""
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(Laser.DEFAULT_BPE_CODES_FILE, None, f_encoder)
        # Single shared language code.
        english_pair = ['hello world!', 'i hope the tests are passing']
        assert laser.embed_sentences(english_pair, lang='en').shape == (2, 1024)
        # One language code per sentence.
        mixed_pair = ['hello world!', "j'aime les pâtes"]
        assert laser.embed_sentences(mixed_pair,
                                     lang=['en', 'fr']).shape == (2, 1024)
        # A bare string counts as a single sentence.
        assert laser.embed_sentences('hello world!', lang='en').shape == (1, 1024)
        # Mismatched sentence/language counts must be rejected.
        with pytest.raises(ValueError):
            laser.embed_sentences(mixed_pair, lang=['en'])
def prep_laser(
        en_x: List[str], es_x: List[str], cm_x: List[str],
        test_x: List[str]) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """Embed all four data splits with a single shared LASER instance.

    Args:
        en_x: split embedded with lang "en".
        es_x: split embedded with lang "es".
        cm_x: split embedded with lang "en".
        test_x: split embedded with lang "en".

    Returns:
        en_x, es_x, cm_x, test_x
    """
    laser = Laser()
    splits = ((en_x, "en"), (es_x, "es"), (cm_x, "en"), (test_x, "en"))
    return tuple(get_laser_embeddings(data, lang, laser)
                 for data, lang in splits)
def run_laser_sts_experiment(cleaning, batch_size=8, random_seed=777):
    """LASER STS experiment: embed both text columns, compute cosine
    similarity per pair, and print the Pearson correlation against labels.

    :param cleaning: when True, texts are normalised with clean_arabic first.
    :param batch_size: sentences per embed_sentences call.
    :param random_seed: unused; kept for interface compatibility.
    """
    df = concatenate("sts_data")
    list_1 = df['text_a'].tolist()
    list_2 = df['text_b'].tolist()
    # Dedup fix: the original duplicated the whole embedding path in both
    # branches; cleaning is now just a preprocessing step.
    if cleaning:
        list_1 = [clean_arabic(item) for item in list_1]
        list_2 = [clean_arabic(item) for item in list_2]
    laser = Laser()

    def _embed(texts, label):
        # Embed in batches with a progress bar; printed output is identical
        # to the original per-branch messages.
        embeddings = []
        for x in tqdm(batch(texts, batch_size)):
            embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list {} embeddings {}".format(label, str(len(embeddings))))
        return embeddings

    list_1_embeddings = _embed(list_1, 1)
    list_2_embeddings = _embed(list_2, 2)
    predicted_similrities = []
    similarities = df['labels'].tolist()
    for embedding_1, embedding_2 in tqdm(zip(list_1_embeddings, list_2_embeddings)):
        cos_sim = dot(embedding_1, embedding_2) / (norm(embedding_1) * norm(embedding_2))
        predicted_similrities.append(cos_sim)
    print("Pearson Coorelation - {}".format(str(pearsonr(similarities, predicted_similrities)[0])))
def initialize_laser():
    """Download the LASER model files and return a ready Laser instance.

    Returns:
        A laserembeddings.Laser object.
    """
    import subprocess
    import sys
    # Run the downloader with the *current* interpreter and without a shell;
    # os.system("python -m ...") used whatever `python` happened to be on
    # PATH and built a shell command string. The original ignored the exit
    # status, so check=False preserves that behavior.
    subprocess.run([sys.executable, "-m", "laserembeddings", "download-models"],
                   check=False)
    return Laser()
import string
import time
import smbclient
from environment import MODE
from whatlangid import WhatLangId
from sentence_transformers import SentenceTransformer

# Constants module is environment-specific: local dev vs deployed.
if MODE == 'local':
    from .local_constants import *
else:
    from .dev_constants import *
from .models import *

# Initialize Laser
# NOTE(review): path_to_bpe_codes / path_to_bpe_vocab / path_to_encoder are
# presumably provided by the star-imported constants module — verify.
laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)

# Initialize Labse embedding
labse_model = SentenceTransformer(labse_location)

# Initialize language detector
language_model = WhatLangId(custom_model=whatlangid_model)
lang_detect = language_model.predict_lang
# lang_detect = classify


class base:
    # Minimal holder pairing a mode string with a model object.
    def __init__(self, mode=None, model=None):
        self.mode = mode
        self.model = model
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
import json
from laserembeddings import Laser
import sys

__author__ = "Bijin Benny"
__email__ = "*****@*****.**"
__license__ = "MIT"
__version__ = "1.0"

# Names of the two vector fields handled by this script.
LASER = 'laser_vector'
BERT = 'bert_vector'

# Multilingual LASER sentence embedder.
laser = Laser()
#Elasticsearch DB client
es = Elasticsearch(hosts="http://*****:*****@localhost:9200/")
#Client connection to local BERT server
bc = BertClient(ip='localhost', output_fmt='list')
"""
doVectorize() pulls entries from the database and maps the text sequences into the vector space using either one of LASER or BERT based on the input parameter. BERT produces 768 dimensional vector while LASER outputs a 1024 dimensional vector Argument : vector_type (String) --> bert_vector or laser_vector
"""
# NOTE(review): the function below is truncated at this chunk boundary —
# its docstring/body continues outside the visible source.
def doVectorize(vector_type):
    """
class FeatureExtractor:
    # Builds LASER embeddings plus hand-crafted NLP features for an
    # English->German quality-estimation data set (en-de/*.ende files).

    def __init__(self, mode="train"):
        # mode selects which split is read from disk ("train", "test", ...).
        self.mode = mode
        self.src = None
        self.tgt = None
        self.scores = None
        self.df = None
        self.laser = Laser()

    def load_data(self):
        # Base df with three columns
        path = f"en-de/{self.mode}.ende"
        # sep="\n" makes each line a single-column row; QUOTE_NONE keeps
        # quotes inside sentences intact.
        src = pd.read_csv(
            f"{path}.src",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        target = pd.read_csv(
            f"{path}.mt",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        df = src.rename(columns={0: "src"})
        if self.mode != "test":
            scores = pd.read_csv(
                f"{path}.scores",
                sep="\n",
                error_bad_lines=False,
                quoting=csv.QUOTE_NONE,
                header=None,
            )
            df["scores"] = scores
        else:
            df["scores"] = [0 for _ in range(len(target))
                            ]  # just placeholder, not used for test
        df["tgt"] = target
        setattr(self, "df", df)
        return df

    def laser_embeddings(self):
        """Extract laser embeddings and reshape appropriately."""
        src = self.laser.embed_sentences(self.df["src"].tolist(),
                                         lang="en")  # (N, 1024)
        tgt = self.laser.embed_sentences(self.df["tgt"].tolist(),
                                         lang="de")  # (N, 1024)
        res = np.zeros((src.shape[0], 2, 1024))  # (N, 2, 1024) ndarray
        res[:, 0, :] = src
        res[:, 1, :] = tgt
        # Standardize scores
        # NOTE(review): MinMaxScaler.fit_transform expects 2-D input; calling
        # it on this 3-D array looks like it would raise — confirm this path
        # actually runs as written.
        res = MinMaxScaler().fit_transform(res)
        return res

    def features(self):
        """Extract baseline features"""
        sp_en = spacy.load("en")
        sp_de = spacy.load("de")
        en_checker = language_check.LanguageTool("en-GB")
        ge_checker = language_check.LanguageTool("de-DE")
        ft = self.df.copy()
        # Sentences without punctuation
        ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(lambda x: x.lower(
        ).translate(str.maketrans("", "", string.punctuation)))
        # Number of tokens
        ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
        ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))
        # count(l1, l2): how many characters of l1 are members of l2.
        count = lambda l1, l2: sum([1 for x in l1 if x in l2])
        # Number of non alphanumeric characters
        ft["src_#punc"] = ft["src"].apply(
            lambda x: count(x, set(string.punctuation)))
        ft["tgt_#punc"] = ft["tgt"].apply(
            lambda x: count(x, set(string.punctuation)))
        # Sentiment analysis
        # NOTE(review): TBD/TBE are defined outside this view (presumably
        # TextBlob / TextBlob-DE aliases) — verify.
        ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
        ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
        ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()
        # Spacy encoding
        ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
        ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))
        # Proofread errors
        ft["sp_pos_diff"] = [
            spacy_parser(x, y, "pos_")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["sp_ent_diff"] = [
            spacy_parser(x, y, "ents")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["src_gram_err"] = ft["src"].apply(
            lambda x: len(en_checker.check(x)))
        ft["tgt_gram_err"] = ft["tgt"].apply(
            lambda x: len(ge_checker.check(x)))
        # Features of interest
        foi = [
            "src_len",
            "tgt_len",
            "src_#punc",
            "tgt_#punc",
            "tgt_polar",
            "src_polar",
            "src_gram_err",
            "tgt_gram_err",
            "sp_pos_diff",
            "sp_ent_diff",
        ]  # Features of interest
        features = ft[foi].values
        # NOTE(review): normalized_features is computed but the raw,
        # unscaled `features` is what gets returned below — likely a bug;
        # confirm intent before changing behavior.
        normalized_features = MinMaxScaler().fit_transform(features)
        return features

    def run(self):
        """Run feature extraction pipeline."""
        print("Loading data")
        self.load_data()
        print("Extracting Laser Embeddings")
        laser_embeds = self.laser_embeddings()
        print(f"Laser features extracted, shape: {laser_embeds.shape}")
        print("Extracting NLP features")
        features = self.features()
        print(f"NLP features extracted, shape: {features.shape}")
        # Bundle results in an ad-hoc named tuple.
        res = namedtuple("res", ["lsr", "feats", "scores"])(
            lsr=laser_embeds, feats=features, scores=self.df["scores"].values)
        return res
from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:
# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
#
# print('')

from problem_util_yr.loadDict.read_json_tool import read_json

gene = read_json('./title_key_5w.json')

BATCH_SIZE = 10      # titles embedded per embed_sentences call
MAX_RECORDS = 10000  # stop consuming input after this many records

ll = []      # current batch of joined title tokens
ii = 0       # number of records consumed so far
allpkl = []  # accumulated [sentences, embeddings] pairs


def _flush(batch):
    """Embed one batch of titles and record it with its sentences."""
    embeddings = laser.embed_sentences(batch, lang='en')
    allpkl.append([batch, embeddings])


for d in gene:
    ii += 1
    # BUG FIX: the original only appended when the batch had room, so the
    # record that arrived while the batch was full was silently dropped
    # (every 11th title was lost). Append first, then flush when full.
    ll.append(' '.join(d['title']))
    if len(ll) == BATCH_SIZE:
        _flush(ll)
        ll = []
    if ii > MAX_RECORDS:
        break

# BUG FIX: the original discarded a trailing partial batch; embed it too.
if ll:
    _flush(ll)

import pandas as pdd