def test_laser():
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(
            Laser.DEFAULT_BPE_CODES_FILE,
            None,
            f_encoder,
        )
        assert laser.embed_sentences(
            ['hello world!', 'i hope the tests are passing'],
            lang='en').shape == (2, 1024)
        assert laser.embed_sentences(
            ['hello world!', "j'aime les pâtes"],
            lang=['en', 'fr']).shape == (2, 1024)
        assert laser.embed_sentences('hello world!',
                                     lang='en').shape == (1, 1024)
def laser_classifier(x_train, y_train, x_test, y_test):
    laser = Laser()
    train_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_train
    ]
    test_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_test
    ]
    train_vectors = [np.concatenate(x) for x in train_vectors]
    test_vectors = [np.concatenate(x) for x in test_vectors]
    classifier = SVC(random_state=0).fit(train_vectors, y_train)
    preds = classifier.predict(test_vectors)
    print(f'Accuracy score: {accuracy_score(preds, y_test).round(2)}')
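# A minimal, hypothetical usage sketch for laser_classifier: the toy Arabic
# sentences and labels below are placeholders, and the imports are the ones
# the function itself relies on (numpy, scikit-learn, laserembeddings).
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from laserembeddings import Laser

x_train = ['مرحبا بالعالم', 'الطقس جميل اليوم', 'أكره الازدحام']
y_train = [1, 1, 0]
x_test = ['يوم رائع', 'الضوضاء مزعجة']
y_test = [1, 0]

laser_classifier(x_train, y_train, x_test, y_test)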
class Singletons:
    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized; do not initialize it again")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        Embed all the sentences passed, using the Laser embedder.

        :param all_sentences: list of sentences to embed
        :return: list of sentence embeddings
        """
        if self.laser_embedder is not None:
            sentence_embeddings = self.laser_embedder.embed_sentences(all_sentences, ["en"] * len(all_sentences))
            return sentence_embeddings
        else:
            logger.info("The embedder is not set; please restart the service")
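# Hypothetical usage sketch: the singleton loads the LASER model once and
# reuses it on every subsequent call. Assumes a configured `logger` and that
# laserembeddings is installed.
embedder = Singletons.get_instance()
vectors = embedder.perform_embeddings(["hello world", "the model is loaded only once"])
print(vectors.shape)  # (2, 1024)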
class LaserEncoder(BaseTorchEncoder):
    """
    Encode a 1d array of strings of size `B` into an ndarray of size `B x D`.

    :class:`LaserEncoder` is an encoder based on Facebook Research's LASER
    (Language-Agnostic SEntence Representations), used to compute multilingual
    sentence embeddings: https://github.com/facebookresearch/LASER

    :param path_to_bpe_codes: path to the BPE codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
    :param path_to_bpe_vocab: path to the BPE vocab from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
    :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
    :param language: language of the text. Defaults to English (en).
    :param args: additional positional arguments
    :param kwargs: additional keyword arguments
    """

    def __init__(
        self,
        path_to_bpe_codes: str = None,
        path_to_bpe_vocab: str = None,
        path_to_encoder: str = None,
        language: str = 'en',
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser

        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        """Load the LaserEncoder model."""
        from laserembeddings import Laser

        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        Encode data into an ndarray of size `B x D`, where `B` is the batch
        size and `D` the embedding dimension.

        :param data: a 1d array of strings of size `B`
        :param args: additional positional arguments
        :param kwargs: additional keyword arguments
        :return: an ndarray of size `B x D`
        """
        return self.model.embed_sentences(data, lang=self.language)
class LaserEncoder(BaseTextEncoder):
    def __init__(self,
                 path_to_bpe_codes: str = Laser.DEFAULT_BPE_CODES_FILE,
                 path_to_bpe_vocab: str = Laser.DEFAULT_BPE_VOCAB_FILE,
                 path_to_encoder: str = Laser.DEFAULT_ENCODER_FILE,
                 language: str = 'en',
                 *args,
                 **kwargs):
        """
        Encoder for language-agnostic sentence representations (LASER) from
        Facebook Research (https://github.com/facebookresearch/LASER).

        :param path_to_bpe_codes: path to the BPE codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to the BPE vocab from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language to be passed while creating the embedding. Defaults to en.
        """
        # Path(...) is always truthy, so the original `if not Path(...)` checks
        # never fired; use .exists() to actually validate the files.
        if not Path(path_to_bpe_codes).exists():
            self.logger.error(f'bpe codes file {path_to_bpe_codes} not found')
        else:
            self._path_to_bpe_codes = path_to_bpe_codes
        if not Path(path_to_bpe_vocab).exists():
            self.logger.error(f'bpe vocab file {path_to_bpe_vocab} not found')
        else:
            self._path_to_bpe_vocab = path_to_bpe_vocab
        if not Path(path_to_encoder).exists():
            self.logger.error(f'encoder file {path_to_encoder} not found')
        else:
            self._path_to_encoder = path_to_encoder
        self.language = language
        super().__init__(*args, **kwargs)

    def post_init(self):
        """Create the Laser object used to compute embeddings during encode."""
        try:
            self.laser = Laser(bpe_codes=self._path_to_bpe_codes,
                               bpe_vocab=self._path_to_bpe_vocab,
                               encoder=self._path_to_encoder)
        except Exception as exp:
            self.logger.error(
                f'Got the following exception while instantiating the Laser model: {exp}'
            )

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        :param data: a 1d array of strings of size `B`
        :return: an ndarray of size `B x D` (D=1024)
        """
        return self.laser.embed_sentences(sentences=data, lang=self.language)
class LASEREmbedder(Embedder):
    def __init__(self, tokenizer_language):
        super().__init__()
        self.laser = Laser()
        self.tokenizer_language = tokenizer_language

    def embed(self, sentence):
        return self.laser.embed_sentences(sentence, self.tokenizer_language)[0]
def getSentenceVector(doc,
                      model_params: dict = None,
                      encoder='distilbert',
                      model_name='distilbert-base-nli-mean-tokens'):
    model_params = model_params or {}  # avoid a mutable default argument
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = [sent.text for sent in tokenized.sents]
    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
        # Use the transformer encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params.get('tokenizer_args', {}))
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        # !pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also usable for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        raise ValueError('Invalid encoder {} or encoder unavailable.'.format(encoder))
    return list(zip(sentences, sentence_embeddings))
def run_laser_sts_experiment(cleaning, batch_size=8, random_seed=777):
    df = concatenate("sts_data")
    list_1 = df['text_a'].tolist()
    list_2 = df['text_b'].tolist()
    list_1_embeddings = []
    list_2_embeddings = []
    laser = Laser()
    if cleaning:
        cleaned_list_1 = [clean_arabic(item) for item in list_1]
        cleaned_list_2 = [clean_arabic(item) for item in list_2]
        for x in tqdm(batch(cleaned_list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 1 embeddings: {}".format(len(list_1_embeddings)))
        for x in tqdm(batch(cleaned_list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 2 embeddings: {}".format(len(list_2_embeddings)))
    else:
        for x in tqdm(batch(list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 1 embeddings: {}".format(len(list_1_embeddings)))
        for x in tqdm(batch(list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 2 embeddings: {}".format(len(list_2_embeddings)))
    predicted_similarities = []
    similarities = df['labels'].tolist()
    for embedding_1, embedding_2 in tqdm(zip(list_1_embeddings, list_2_embeddings)):
        cos_sim = dot(embedding_1, embedding_2) / (norm(embedding_1) * norm(embedding_2))
        predicted_similarities.append(cos_sim)
    print("Pearson correlation - {}".format(pearsonr(similarities, predicted_similarities)[0]))
class LaserVectorizer(TransformerMixin, BaseEstimator):
    def __init__(self):
        # path_to_bpe_codes, path_to_bpe_vocab and path_to_encoder are assumed
        # to be defined at module level
        self.model = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)
        print('Applying Laser transform')

    def fit(self, X, y=None):  # y=None for compatibility with scikit-learn pipelines
        return self

    def transform(self, X):
        x_laser = self.model.embed_sentences(X, lang='en')
        return x_laser
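# A minimal, hypothetical usage sketch: LaserVectorizer dropped into a
# scikit-learn pipeline. Assumes path_to_bpe_codes, path_to_bpe_vocab and
# path_to_encoder are defined at module level, as the class expects.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

texts = ['good product', 'terrible service']
labels = [1, 0]
clf = make_pipeline(LaserVectorizer(), LogisticRegression())
clf.fit(texts, labels)
print(clf.predict(['great experience']))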
def transform_sentences(_sent_map):
    """
    Builds sentence embeddings using the LASER model.

    :param _sent_map: mapping whose keys are the sentences to embed.
    :return: torch matrix of embeddings, one 1024-dim row per sentence.
    """
    laser = Laser()
    sentences = list(_sent_map.keys())
    _sent_embs = laser.embed_sentences(sentences, lang='en')
    _sent_tensors = [torch.from_numpy(j) for j in _sent_embs]
    return torch.stack(_sent_tensors)
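# Hypothetical usage sketch: the keys of the mapping are the sentences to
# embed, and the result is a (num_sentences, 1024) torch matrix. Assumes
# torch and laserembeddings are installed.
sent_map = {'hello world': 0, 'sentence embeddings in one call': 1}
matrix = transform_sentences(sent_map)
print(matrix.shape)  # torch.Size([2, 1024])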
class LaserEncoder(BaseTorchEncoder):
    """
    :class:`LaserEncoder` is an encoder based on Facebook Research's LASER
    (Language-Agnostic SEntence Representations), used to compute multilingual
    sentence embeddings. It encodes a 1d array of strings of size `B` into an
    ndarray of size `B x D`.
    https://github.com/facebookresearch/LASER
    """

    def __init__(
        self,
        path_to_bpe_codes: str = None,
        path_to_bpe_vocab: str = None,
        path_to_encoder: str = None,
        language: str = 'en',
        *args,
        **kwargs,
    ):
        """
        :param path_to_bpe_codes: path to the BPE codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to the BPE vocab from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language of the text. Defaults to en.
        :param args:
        :param kwargs:
        """
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser

        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        from laserembeddings import Laser

        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        :param data: a 1d array of strings of size `B`
        :param args:
        :param kwargs:
        :return: an ndarray of size `B x D`
        """
        return self.model.embed_sentences(data, lang=self.language)
class Vectorizer(object):
    """
    Encoding/vectorization wrapper for various text-embedding models.

    @:param method: str, optional (default: 'muse'); alias of the encoding/vectorization method to use
        - 'use' - Universal Sentence Encoder (https://tfhub.dev/google/universal-sentence-encoder/4)
        - 'muse' - Multilingual Universal Sentence Encoder (https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)
        - 'laser' - Language-Agnostic SEntence Representations (https://github.com/facebookresearch/LASER)
    @:param path_to_model: str, optional (default: './models/muse/'); path to the model
        (not needed for LASER; for tf-hub models, this may be either a link or the path to a locally saved model)
    """
    __valid_methods = ['muse', 'laser', 'use']

    def __init__(self, method: str = 'muse', path_to_model: str = './models/muse/'):
        assert method in self.__valid_methods, \
            f'Expected method aliases: {self.__valid_methods}'
        self.method = method
        if self.method in ('muse', 'use'):
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'laser':
            self.__vectorizer = Laser()
        else:
            self.__vectorizer = None

    def vectorize(self, docs: List[str], **kwargs) -> List[List[float]]:
        if self.method in {'muse', 'use'}:
            result = self.__vectorizer(docs).numpy().tolist()
        elif self.method == 'laser':
            result = self.__vectorizer.embed_sentences(docs, **kwargs).tolist()
        else:
            raise ValueError(f'Method {self.method} is not available')
        return result
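# Hypothetical usage sketch: with the 'laser' method, keyword arguments are
# forwarded to Laser.embed_sentences, so the tokenization language is passed
# via lang=.
vectorizer = Vectorizer(method='laser')
vectors = vectorizer.vectorize(['let your neural network be polyglot',
                                'use multilingual embeddings!'], lang='en')
print(len(vectors), len(vectors[0]))  # 2 1024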
def test_similarity(test_data):
    if not SIMILARITY_TEST:
        pytest.skip("SIMILARITY_TEST not set")
    if not test_data:
        raise FileNotFoundError(
            'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that 🔧'
        )
    report = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'report', 'comparison-with-LASER.md')
    laser = Laser()
    with open(report, 'w', encoding='utf-8') as f_report:
        f_report.write(
            '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n'
        )
        f_report.write(
            '| |language|avg. cosine similarity|min. cosine similarity|\n')
        f_report.write(
            '|-|--------|----------------------|----------------------|\n')
        for lang in test_data['langs']:
            if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
                # language not supported, ignoring
                continue
            sents = test_data[f'{lang}_sentences']
            orig_embeddings = test_data[f'{lang}_embeddings']
            embeddings = laser.embed_sentences(sents, lang)
            assert embeddings.shape == orig_embeddings.shape
            cosine_similarities = np.sum(
                orig_embeddings * embeddings,
                axis=1) / (np.linalg.norm(orig_embeddings, axis=1) *
                           np.linalg.norm(embeddings, axis=1))
            similarity_mean = np.mean(cosine_similarities)
            similarity_min = np.min(cosine_similarities)
            f_report.write(
                f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n'
            )
def encode_documents_laser(documents, params, tokenizer=None):
    max_input_length = params['max_length']  # currently unused
    laser = Laser()
    output = torch.zeros(size=(len(documents), params['max_sentences_per_doc'], 3, 1024),
                         dtype=torch.float)
    for doc_index, tokenized_document in tqdm(enumerate(documents)):
        lang_list = []
        for ele in tokenized_document:
            try:
                lang_list.append(detect(ele))
            except Exception:
                # langdetect fails on very short or empty strings; fall back to English
                lang_list.append('en')
        embeddings = laser.embed_sentences(
            tokenized_document,
            lang=lang_list)  # lang is only used for tokenization
        for seq_index, embed in enumerate(embeddings):
            if seq_index >= params['max_sentences_per_doc']:
                continue
            output[doc_index][seq_index][0] = torch.FloatTensor(embed)
    return output
from numpy import dot
from numpy.linalg import norm


def cos_sim(a, b):
    return dot(a, b) / (norm(a) * norm(b))


path_to_bpe_codes = '/home/darth.vader/laser/93langs.fcodes'
path_to_bpe_vocab = '/home/darth.vader/laser/93langs.fvocab'
path_to_encoder = '/home/darth.vader/laser/bilstm.93langs.2018-12-26.pt'

laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)

emb = laser.embed_sentences(['how are you'], lang='en')
te_emb = laser.embed_sentences(['क्या हाल है'], lang='hi')
sim = cos_sim(emb[0], te_emb[0])
print(sim)

import time
import numpy as np
import langid
from ilmulti.segment import SimpleSegmenter, Segmenter
from ilmulti.sentencepiece import SentencePieceTokenizer
class FeatureExtractor:
    def __init__(self, mode="train"):
        self.mode = mode
        self.src = None
        self.tgt = None
        self.scores = None
        self.df = None
        self.laser = Laser()

    def load_data(self):
        # Base df with three columns
        path = f"en-de/{self.mode}.ende"
        src = pd.read_csv(
            f"{path}.src",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        target = pd.read_csv(
            f"{path}.mt",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        df = src.rename(columns={0: "src"})
        if self.mode != "test":
            scores = pd.read_csv(
                f"{path}.scores",
                sep="\n",
                error_bad_lines=False,
                quoting=csv.QUOTE_NONE,
                header=None,
            )
            df["scores"] = scores
        else:
            df["scores"] = [0 for _ in range(len(target))]  # just a placeholder, not used for test
        df["tgt"] = target
        setattr(self, "df", df)
        return df

    def laser_embeddings(self):
        """Extract LASER embeddings and reshape appropriately."""
        src = self.laser.embed_sentences(self.df["src"].tolist(), lang="en")  # (N, 1024)
        tgt = self.laser.embed_sentences(self.df["tgt"].tolist(), lang="de")  # (N, 1024)
        res = np.zeros((src.shape[0], 2, 1024))  # (N, 2, 1024) ndarray
        res[:, 0, :] = src
        res[:, 1, :] = tgt
        # Scale the embeddings; MinMaxScaler only accepts 2D input, so scale a
        # flattened view and restore the shape (the original passed the 3D array directly)
        n = res.shape[0]
        res = MinMaxScaler().fit_transform(res.reshape(n, -1)).reshape(n, 2, 1024)
        return res

    def features(self):
        """Extract baseline features."""
        sp_en = spacy.load("en")
        sp_de = spacy.load("de")
        en_checker = language_check.LanguageTool("en-GB")
        ge_checker = language_check.LanguageTool("de-DE")
        ft = self.df.copy()

        # Sentences without punctuation
        ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(
            lambda x: x.lower().translate(str.maketrans("", "", string.punctuation)))

        # Number of tokens
        ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
        ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))

        count = lambda l1, l2: sum([1 for x in l1 if x in l2])

        # Number of non-alphanumeric characters
        ft["src_#punc"] = ft["src"].apply(lambda x: count(x, set(string.punctuation)))
        ft["tgt_#punc"] = ft["tgt"].apply(lambda x: count(x, set(string.punctuation)))

        # Sentiment analysis
        ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
        ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
        ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()

        # Spacy encoding
        ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
        ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))

        ft["sp_pos_diff"] = [
            spacy_parser(x, y, "pos_") for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["sp_ent_diff"] = [
            spacy_parser(x, y, "ents") for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]

        # Proofread errors
        ft["src_gram_err"] = ft["src"].apply(lambda x: len(en_checker.check(x)))
        ft["tgt_gram_err"] = ft["tgt"].apply(lambda x: len(ge_checker.check(x)))

        # Features of interest
        foi = [
            "src_len",
            "tgt_len",
            "src_#punc",
            "tgt_#punc",
            "tgt_polar",
            "src_polar",
            "src_gram_err",
            "tgt_gram_err",
            "sp_pos_diff",
            "sp_ent_diff",
        ]
        features = ft[foi].values
        # Return the scaled features (the original computed them but returned the raw values)
        normalized_features = MinMaxScaler().fit_transform(features)
        return normalized_features

    def run(self):
        """Run the feature extraction pipeline."""
        print("Loading data")
        self.load_data()
        print("Extracting LASER embeddings")
        laser_embeds = self.laser_embeddings()
        print(f"LASER features extracted, shape: {laser_embeds.shape}")
        print("Extracting NLP features")
        features = self.features()
        print(f"NLP features extracted, shape: {features.shape}")
        res = namedtuple("res", ["lsr", "feats", "scores"])(
            lsr=laser_embeds, feats=features, scores=self.df["scores"].values)
        return res
class SentimentAnalyse(object):
    """
    SentimentAnalyse builds a model for sentiment analysis on sentences.
    laserembeddings is used for the embeddings and keras to build the model.

    Methods:
        generate_model -> train a model from reviews and save it
        load_model -> load a trained model
        model_predict -> take a list of sentences and use the model to predict sentiment
    """

    def __init__(self, verbose=1):
        # load the laserembeddings models; if they're missing, download them
        try:
            self.laser = Laser()
        except Exception:
            if verbose > 0:
                print("WARNING laserembeddings models missing, downloading ...")
            os.system("python -m laserembeddings download-models")
            self.laser = Laser()

        # load the reviews csv; if it's missing, generate it with
        # "generate_csv_from_reviews.py", which is based on the reviews in "sorted_data"
        if not os.path.isfile("labeled_reviews.csv"):
            if verbose > 0:
                print("WARNING csv missing, generating ...")
            start_timer = time.time()
            generate_csv_from_reviews.generate_csv_from_reviews("labeled_reviews.csv")
            if verbose > 0:
                print("time to generate:", round(time.time() - start_timer, 2), "s")

        # load stopwords
        f = open("sorted_data/stopwords", "r")
        self.stopwords = f.read().split("\n")
        self.stopwords.pop(-1)

        self.df_reviews = pd.read_csv("labeled_reviews.csv")

        # initialise model as False so we know it isn't loaded yet
        self.model = False

        if verbose > 0:
            print("SentimentAnalyse ready to use.")

    def _train_model(self, model, X, Y, path_model_save="model.h5", verbose=2):
        """
        params:
            model : keras model -> model to train
            X : list of embedded sentences -> input of the model (X)
            Y : list of int, sentiments of X -> output of the model (y hat)
            path_model_save : str -> path to save the model
            verbose : int -> show progress if verbose > 0
        return:
            model : keras model -> the trained model
        """
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

        # compile model
        model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            metrics=[tf.keras.metrics.BinaryAccuracy()],
        )

        # train model
        model.fit(
            X_train, Y_train,
            batch_size=32,
            epochs=1000,
            validation_split=0.2,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10),
                tf.keras.callbacks.ModelCheckpoint(path_model_save, monitor='binary_accuracy', mode='max', verbose=0, save_best_only=True)
            ],
            verbose=verbose
        )

        # show accuracy
        if verbose > 0:
            print()
            print('Train Accuracy')
            model.evaluate(X_train, Y_train)
            print('Test Accuracy')
            model.evaluate(X_test, Y_test)

        return model

    def preprocess_text_list(self, text_list, lang="en"):
        """Return the preprocessed (embedded) text_list."""
        # for i in tqdm(range(len(text_list))):
        #     # strip "\n"
        #     text_list[i] = text_list[i].strip("\n")
        #     # lowercase
        #     text_list[i] = text_list[i].lower()
        #     # stop words
        #     text_list[i] = ' '.join([word for word in text_list[i] if not word in self.stopwords])
        print("embedding ...")
        # embed the sentences
        preprocess_text_list = self.laser.embed_sentences(text_list, lang=lang)
        return preprocess_text_list

    def generate_model(self, sample_size=2000, path_model_save="model.h5", verbose=2):
        """
        params:
            sample_size : int -> number of sentences used to train the model
            path_model_save : str -> path to save the model
            verbose : int -> show progress if verbose > 0
        return:
            model : keras model -> trained model
        """
        # loading data for training
        # the smaller sample_size is, the faster the model is generated
        df_train = self.df_reviews[self.df_reviews["sentiment"] == 1].head(int(sample_size / 2))
        df_train = df_train.append(self.df_reviews[self.df_reviews["sentiment"] == 0].head(sample_size - int(sample_size / 2)))
        df_train = df_train.reset_index(drop=True)

        # shuffle the DataFrame
        df_train = df_train.sample(frac=1).reset_index(drop=True)

        if verbose > 0:
            print("Train data successfully loaded")
            display.display(df_train.head())
            print(df_train["sentiment"].value_counts())
            print(df_train["rating"].value_counts())
            print("Shape :", df_train.shape)
            print("preprocessing text ...")

        # embed the sentences with laserembeddings:
        # embedded sentences as input X, rating values as output Y
        X_train = self.preprocess_text_list(df_train["review_text"].values.tolist())
        Y_train = df_train["rating"]
        # min-max scale the ratings
        Y_train = (Y_train - Y_train.min()) / (Y_train.max() - Y_train.min())

        if verbose > 0:
            print("preprocessing done")
            print("training model ...")

        # creating the model
        # every embedded sentence has length 1024, so for each sentence we have 1024 inputs and 1 output
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(1024,)),
            tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid),
        ])

        # train the model
        self.model = self._train_model(model, X_train, Y_train, path_model_save=path_model_save, verbose=verbose)

        # save the model
        self.model.save(path_model_save)
        return self.model

    def load_model(self, model_path):
        if not os.path.isfile(model_path):
            print(model_path, "is missing")
        else:
            self.model = tf.keras.models.load_model(model_path)
            print(model_path, "loaded")

    def model_predict(self, sentence_list):
        """
        params:
            sentence_list : list(str) -> list of sentences
        return:
            predictions if sentence_list is a list and a model is loaded, else None
        """
        if isinstance(sentence_list, list) and self.model is not False:
            return self.model.predict(self.preprocess_text_list(sentence_list))
        return None
class myVectorizer_Laser(object):
    def fit(self, X):
        self.laser = Laser()
        return self

    def transform(self, X):
        return normalize(self.laser.embed_sentences(X, lang='ru'), norm='l2')
def get_laser_embeddings(x: List[str], lang: str, laser=None) -> np.ndarray:
    if laser is None:
        laser = Laser()
    return laser.embed_sentences(sentences=x, lang=lang)
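# Hypothetical usage sketch: pass a pre-built Laser instance so the model is
# not reloaded on every call.
laser = Laser()
vecs_en = get_laser_embeddings(['hello world'], lang='en', laser=laser)
vecs_fr = get_laser_embeddings(['bonjour le monde'], lang='fr', laser=laser)
print(vecs_en.shape, vecs_fr.shape)  # (1, 1024) (1, 1024)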
def test_ja():
    if SKIP_JA:
        pytest.skip("SKIP_JA is set")
    laser = Laser()
    assert laser.embed_sentences(['乾杯!'], lang='ja').shape == (1, 1024)
# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
# print('')

from laserembeddings import Laser
from problem_util_yr.loadDict.read_json_tool import read_json

laser = Laser()
gene = read_json('./title_key_5w.json')

ll = []
ii = 0
allpkl = []
for d in gene:
    ii += 1
    ll.append(' '.join(d['title']))
    if len(ll) == 10:
        # embed the batch, then start a new one (the original flushed before
        # appending, silently dropping the title that triggered the flush)
        embeddings = laser.embed_sentences(ll, lang='en')
        allpkl.append([ll, embeddings])
        ll = []
    if ii > 10000:
        break

import pandas as pdd
pdd.to_pickle(allpkl, './allpkl.pkl')

# embeddings = laser.embed_sentences(
#     ['今 天 天 气 晴 朗',
#      '今 天 天 气 很 不 错',
#      '股 票 怎 么 跌 成 这 样'],
#     lang='en')  # lang is only used for tokenization
import numpy as np

nlp = spacy.load('pl_core_news_lg')

model = make_pipeline(
    FunctionTransformer(lambda x: np.stack([nlp(t).vector for t in x])),
    Normalizer(),
    AgglomerativeClustering(distance_threshold=0.5, n_clusters=None),
)
clusters = model.fit_predict(texts)
print(clusters)  # [2 0 2 0 1]

from laserembeddings import Laser

laser = Laser()

model = make_pipeline(
    FunctionTransformer(lambda x: laser.embed_sentences(x, lang='en')),
    Normalizer(),
    AgglomerativeClustering(distance_threshold=0.8, n_clusters=None),
)
clusters = model.fit_predict(texts)
print(clusters)  # [1 1 1 0 0]

# results for each model
from collections import defaultdict

cluster2words = defaultdict(list)
for text, cluster in zip(texts, clusters):
    for word in text.split():
        if word not in cluster2words[cluster]:
            cluster2words[cluster].append(word)
test = [wordlist for wordlist in cluster2words.values()]
def test_zh():
    if SKIP_ZH:
        pytest.skip("SKIP_ZH is set")
    laser = Laser()
    assert laser.embed_sentences(['干杯!'], lang='zh').shape == (1, 1024)
from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:
# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
# print('')

embeddings = laser.embed_sentences(
    ['今 天 天 气 晴 朗',
     '今 天 天 气 很 不 错',
     '股 票 怎 么 跌 成 这 样'],
    lang='en')  # lang is only used for tokenization

# embeddings = laser.embed_sentences(
#     ['今天天气晴朗',
#      '今天天气很不错',
#      '股票怎么跌成这样'],
#     lang='zh')  # uses jieba tokenization; lang is only used for tokenization
# print('')

# embeddings is an N*1024 (N = number of sentences) NumPy array
import pandas as pdd
pdd.to_pickle(embeddings, 'emb.pkl')
def get_sentence_vec(self, sentences):
    # note: instantiating Laser here reloads the model on every call
    laser = Laser()
    sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    return sentence_embeddings
def getDocumentEmbedding(doc,
                         model_params: dict = None,
                         encoder='xlnet',
                         model_name='xlnet-base-uncased'):
    # model = SentenceTransformer(model_name, model_params)
    # sentence_embedding = model.encode(doc)
    model_params = model_params or {}  # avoid a mutable default argument

    ## Word tokenizer
    from spacy.lang.en import English
    nlp = English()
    # Create a tokenizer with the default settings for English,
    # including punctuation rules and exceptions
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    tokens = tokenizer(doc)  # the original tokenized a hard-coded string instead of doc
    if len(tokens) > getMaxLength(encoder):
        warnings.warn("The input sequence length exceeds the maximum limit.", Warning)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart', 'finbert']:
        # Use the transformer model for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(doc)
    elif encoder == 'use':
        # !pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=doc)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(doc, tokenize=True)  # was `sentences`, undefined here
        sentence_embeddings = infersent.encode(doc, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(doc)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also usable for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(doc, lang='en')  # was `sentences`, undefined here
    else:
        raise ValueError('Invalid encoder {} or encoder unavailable.'.format(encoder))
    return sentence_embeddings
df = pd.DataFrame({
    'review_text': data.review_text,
    'rating': data.rating,
})

# data cleaning
df['review_text'] = df['review_text'].apply(lambda x: preprocessing(x))
df['rating'] = df['rating'].apply(lambda x: preprocessing(x))

# shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# instantiate Laser
laser = Laser()
embed = laser.embed_sentences(df['review_text'], lang='en')

# train/test split (assuming 2,000 rows; the original `embed[400:]` test
# slice overlapped the training rows)
X_train = embed[:1600]
X_test = embed[1600:]
y_train = df['rating'][:1600]
y_test = df['rating'][1600:]

# Fitting a random forest classifier to the training data
text_classifier = RandomForestClassifier(n_estimators=50)
print("Fitting random forest to training data....")
class Singletons:
    __instance = None
    laser_embedder = cached_lq_dims = cached_intro_dims = None
    # robert_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized; do not initialize it again")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            self.cached_lq_dims = {}
            self.cached_intro_dims = {}
            # logger.info("Initializing Roberta embedder")
            # self.robert_embedder = SentenceTransformer(constants.fetch_constant("robeta_path"))
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        Embed all the sentences passed, using the Laser embedder.

        :param all_sentences: list of sentences to embed
        :return: list of sentence embeddings
        """
        if self.laser_embedder is not None:
            sentence_embeddings = self.laser_embedder.embed_sentences(all_sentences, ["en"] * len(all_sentences))
            return sentence_embeddings
        else:
            logger.info("The embedder is not set; please restart the service")

    # def perform_embeddings(self, all_sentences):
    #     """
    #     Embed all the sentences passed, using the Roberta embedder.
    #     :param all_sentences:
    #     :return: list of sentence embeddings
    #     """
    #     if self.robert_embedder is not None:
    #         sentence_embeddings = self.robert_embedder.encode(all_sentences)
    #         return sentence_embeddings
    #     else:
    #         logger.info("The embedder is not set; please restart the service")

    def get_cached_lq_dims(self):
        """:return: the dictionary of cached LQ facets"""
        return self.cached_lq_dims

    def set_cached_lq_dims(self, facet_name, facet):
        """Store a facet in the LQ cache."""
        self.cached_lq_dims[facet_name] = facet

    def get_cached_intro_dims(self):
        """:return: the dictionary of cached intro facets"""
        return self.cached_intro_dims

    def set_cached_intro_dims(self, facet_name, facet):
        """Store a facet in the intro cache."""
        self.cached_intro_dims[facet_name] = facet