import os

import numpy as np
import pandas as pd
from deeppavlov.core.data.utils import download
from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder
from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer


def prepare_embeddings_for_dataset():
    """Write a file with document embeddings."""
    print("The first run may take several minutes for preparation.")
    data_files = os.listdir("data")
    if "glove.6B.100d.txt" not in data_files:
        url = "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt"
        download("data/glove.6B.100d.txt", source_url=url)

    tokenizer = NLTKTokenizer()
    embedder = GloVeEmbedder(load_path="data/glove.6B.100d.txt", pad_zero=True)

    with open("data/embeddings.csv", "w") as emb_file:
        # Embed the dataset chunk by chunk to keep memory usage bounded.
        for chunk in pd.read_csv(
            "data/prepared_dataset.csv",
            header=None,
            chunksize=10000,
            names=["Text"],
        ):
            # `mean=True` returns one averaged GloVe vector per document.
            # `Array` is the project's array type alias, assumed imported elsewhere.
            embeddings: Array[float] = embedder(tokenizer(chunk.loc[:, "Text"]), mean=True)
            np.savetxt(emb_file, embeddings, delimiter=",", fmt="%1.5f")

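# Minimal usage sketch (not part of the original snippet): run the preparation
# step above, then load the stored vectors back as a matrix. File names follow
# the function above; the 100-dimension check assumes the glove.6B.100d vectors.
prepare_embeddings_for_dataset()
doc_vectors = np.loadtxt("data/embeddings.csv", delimiter=",")
print(doc_vectors.shape)  # (n_documents, 100) for glove.6B.100d
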
def __init__(self, use_elmo=True, elmo_scale=1., cap_scale=1., use_cap_feat=False,
             use_glove=False, use_fastText=False, elmo_params=None):
    self.use_elmo = use_elmo
    self.elmo_scale = elmo_scale
    self.cap_scale = cap_scale
    self.use_cap_feat = use_cap_feat
    self.use_glove = use_glove
    self.use_fastText = use_fastText
    if self.use_elmo:
        self.elmo = ElmoEmbedder(**(elmo_params or {}))
    if self.use_cap_feat:
        self.cap_prep = CapitalizationPreprocessor()
    if self.use_glove:
        self.glove = GloVeEmbedder('embeddings/glove.6B/glove.6B.100d.txt', pad_zero=True)
    if self.use_fastText:
        self.fastText = FasttextEmbedder('embeddings/wiki.en.bin', pad_zero=True)
    # Probe the combined feature dimensionality with a dummy token
    # (`embed` is defined elsewhere in this class).
    self.embed_size = self.embed(['hehe']).shape[-1]

def __init__(self, use_elmo=True, elmo_scale=1., use_cap_feat=False, use_glove=False):
    self.use_elmo = use_elmo
    self.elmo_scale = elmo_scale
    self.use_cap_feat = use_cap_feat
    self.use_glove = use_glove
    if self.use_elmo:
        self.elmo = ElmoEmbedder()
    if self.use_cap_feat:
        self.cap_prep = CapitalizationPreprocessor()
    if self.use_glove:
        self.glove = GloVeEmbedder('embeddings/glove.6B/glove.6B.100d.txt', pad_zero=True)

def glove_generator(df, batch_size, method='average', embedder=None, only_labels=False):
    from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder

    # Expect the `sentence` (and `question`) columns to hold token lists, not raw strings.
    assert type(df.sentence[0]) is not str

    data = df.copy()
    if embedder is None:
        embedder = GloVeEmbedder(load_path="data/models/glove.txt", pad_zero=False)

    i = 0
    while True:
        batch_labels = data.label[i:i + batch_size]
        if only_labels:
            yield batch_labels
        else:
            batch_questions = embedder(data.question[i:i + batch_size])
            batch_sents = embedder(data.sentence[i:i + batch_size])
            if method == 'concat':
                # Flatten per-token vectors into one long vector per sample.
                batch_questions = np.vstack([Q.ravel() for Q in batch_questions])
                batch_sents = np.vstack([S.ravel() for S in batch_sents])
            elif method == 'average':
                # Average token vectors into one fixed-size vector per sample.
                batch_questions = np.vstack([np.array(Q).mean(axis=0) for Q in batch_questions])
                batch_sents = np.vstack([np.array(S).mean(axis=0) for S in batch_sents])
            else:
                raise NotImplementedError
            yield np.hstack((batch_questions, batch_sents)), batch_labels
        i += batch_size
        if i >= data.shape[0]:
            # Reshuffle and start a new epoch.
            data = data.sample(frac=1.)
            i = 0

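# Hedged usage sketch: `glove_generator` yields batches forever, so it can be
# consumed with `next()` or passed to a Keras-style `fit_generator`. `train_df`
# is a hypothetical DataFrame whose `question` and `sentence` columns already
# hold token lists (note the assert above) and whose `label` column holds targets.
gen = glove_generator(train_df, batch_size=64, method='average')
X_batch, y_batch = next(gen)
# With averaged 100-d GloVe vectors, X_batch would have shape (64, 200):
# the question and sentence embeddings concatenated along the feature axis.
print(X_batch.shape, y_batch.shape)
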
"""##Bag-of-Words""" # initialize bag-of-words embedder giving total number of tokens bow = BoWEmbedder(depth=token_vocab.len) # it assumes indexed tokenized samples bow(token_vocab(str_lower(tokenizer(['Kaggle is the best place to study machine learning.'])))) # all 10 tokens are in the vocabulary sum(bow(token_vocab(str_lower(tokenizer(['Kaggle is the best place to study machine learning.']))))[0]) """##GloVe Embedder""" # Glove : https://nlp.stanford.edu/projects/glove/ simple_download(url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt", destination="./glove.6B.100d.txt") embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt',dim=100, pad_zero=True) """#Model Build""" # get all train and valid data from iterator x_train, y_train = train_iterator.get_instances(data_type="train") x_valid, y_valid = train_iterator.get_instances(data_type="valid") # Intialize `KerasClassificationModel` that composes CNN shallow-and-wide network # (name here as`cnn_model`) cls = KerasClassificationModel(save_path="./cnn_model_v0", load_path="./cnn_model_v0", embedding_size=embedder.dim, n_classes=classes_vocab.len, model_name="cnn_model", text_size=15, # number of tokens
def prepare_embeddings_for_query(query: str) -> Array[float]:
    """Return an embedding for a query string."""
    tokenizer = NLTKTokenizer()
    embedder = GloVeEmbedder(load_path="data/glove.6B.100d.txt", pad_zero=True)
    embed_query: Array[float] = embedder(tokenizer([query]), mean=True)[0]
    return embed_query

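# Hedged sketch of how the two helpers above could be combined for retrieval:
# embed the query, load the document vectors written by
# `prepare_embeddings_for_dataset`, and rank documents by cosine similarity.
# `most_similar_documents` is an illustrative helper, not part of the original snippets.
def most_similar_documents(query: str, top_n: int = 5):
    query_vec = prepare_embeddings_for_query(query)
    doc_vecs = np.loadtxt("data/embeddings.csv", delimiter=",")
    # Cosine similarity; the small epsilon guards against zero-norm rows.
    norms = np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec)
    scores = doc_vecs @ query_vec / (norms + 1e-9)
    return np.argsort(scores)[::-1][:top_n]
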
        '<UNK>', ),
    unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

token_vocab.freqs.most_common()[:10]

tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./tfidf_v0.pkl',
    load_path='./tfidf_v0.pkl',
    mode='train')

tfidf.fit(str_lower(train_iterator.get_instances(data_type='train')[0]))
tfidf.save()

# reuse the GloVe embeddings file downloaded above
embedder = GloVeEmbedder(load_path="./glove.6B.100d.txt", dim=100, pad_zero=True)

weighted_embedder = TfidfWeightedEmbedder(
    embedder=embedder,    # our GloVe embedder instance
    tokenizer=tokenizer,  # our tokenizer instance
    mean=True,            # to return one vector per sample
    vectorizer=tfidf      # our TF-IDF vectorizer
)

x_train, y_train = train_iterator.get_instances(data_type="train")
x_valid, y_valid = train_iterator.get_instances(data_type="test")

x_train = weighted_embedder(x_train)
x_valid = weighted_embedder(x_valid)

cls = SklearnComponent(
    model_class="sklearn.linear_model:LogisticRegression",
    infer_method="predict",
    save_path='./logreg_v0.pkl',
    load_path='./logreg_v0.pkl',