# imports assumed for this snippet (DeepPavlov 0.x module paths)
import os

import numpy as np
import pandas as pd

from deeppavlov.core.data.utils import download
from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder
from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer


def prepare_embeddings_for_dataset():
    """Write a file with document embeddings."""
    print("The first run may take several minutes for preparation.")
    data_files = os.listdir("data")
    if "glove.6B.100d.txt" not in data_files:
        url = "http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt"
        download(
            "data/glove.6B.100d.txt",
            source_url=url,
        )

    tokenizer = NLTKTokenizer()
    embedder = GloVeEmbedder(load_path="data/glove.6B.100d.txt", pad_zero=True)

    with open("data/embeddings.csv", "w") as emb_file:
        for chunk in pd.read_csv(
                "data/prepared_dataset.csv",
                header=None,
                chunksize=10000,
                names=["Text"],
        ):
            # one mean GloVe vector per document in the chunk
            embeddings = embedder(
                tokenizer(chunk.loc[:, "Text"].tolist()), mean=True)
            np.savetxt(emb_file, embeddings, delimiter=",", fmt="%1.5f")
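
# The embeddings can be streamed back the same way they were written: each CSV
# row is one mean GloVe vector. A small helper sketch (not in the source; the
# chunk size mirrors the writer above):
def iterate_dataset_embeddings(chunksize: int = 10000):
    for chunk in pd.read_csv("data/embeddings.csv", header=None,
                             chunksize=chunksize):
        yield chunk.to_numpy(dtype=np.float32)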

# Example 2
def __init__(self,
             use_elmo=True,
             elmo_scale=1.,
             cap_scale=1.,
             use_cap_feat=False,
             use_glove=False,
             use_fastText=False,
             elmo_params=None):
    # avoid a mutable default argument for the ELMo config
    elmo_params = elmo_params or {}
    self.use_elmo = use_elmo
    self.elmo_scale = elmo_scale
    self.cap_scale = cap_scale
    self.use_cap_feat = use_cap_feat
    self.use_glove = use_glove
    self.use_fastText = use_fastText
    if self.use_elmo:
        self.elmo = ElmoEmbedder(**elmo_params)
    if self.use_cap_feat:
        self.cap_prep = CapitalizationPreprocessor()
    if self.use_glove:
        self.glove = GloVeEmbedder('embeddings/glove.6B/glove.6B.100d.txt',
                                   pad_zero=True)
    if self.use_fastText:
        self.fastText = FasttextEmbedder('embeddings/wiki.en.bin',
                                         pad_zero=True)
    # probe with a dummy input to infer the combined embedding size
    self.embed_size = self.embed(['hehe']).shape[-1]
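
# The probe call above relies on an `embed` method the snippet does not
# include. A minimal sketch of what it could look like, assuming each enabled
# embedder is callable on a batch of tokenized sentences and per-token vectors
# are concatenated (the combination logic is an assumption; capitalization
# features are omitted for brevity):
def embed(self, batch):
    parts = []
    if self.use_elmo:
        # scale the ELMo vectors as configured
        parts.append([self.elmo_scale * np.asarray(sent)
                      for sent in self.elmo(batch)])
    if self.use_glove:
        parts.append([np.asarray(sent) for sent in self.glove(batch)])
    if self.use_fastText:
        parts.append([np.asarray(sent) for sent in self.fastText(batch)])
    # concatenate features sentence by sentence along the feature axis
    return np.array([np.concatenate(sents, axis=-1) for sents in zip(*parts)])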

# Example 3
def __init__(self, use_elmo=True, elmo_scale=1., use_cap_feat=False, use_glove=False):
    self.use_elmo = use_elmo
    self.elmo_scale = elmo_scale
    self.use_cap_feat = use_cap_feat
    self.use_glove = use_glove
    if self.use_elmo:
        self.elmo = ElmoEmbedder()
    if self.use_cap_feat:
        self.cap_prep = CapitalizationPreprocessor()
    if self.use_glove:
        self.glove = GloVeEmbedder('embeddings/glove.6B/glove.6B.100d.txt', pad_zero=True)

# Example 4
def glove_generator(df,
                    batch_size,
                    method='average',
                    embedder=None,
                    only_labels=False):
    """Yield batches of GloVe features (and labels) from a dataframe with
    pre-tokenized `question` and `sentence` columns and a `label` column."""
    import numpy as np
    from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder

    # sentences must already be tokenized (lists of tokens, not raw strings)
    assert type(df.sentence[0]) is not str
    data = df.copy()

    if embedder is None:
        embedder = GloVeEmbedder(load_path="data/models/glove.txt",
                                 pad_zero=False)
    i = 0
    while True:

        batch_labels = data.label[i:i + batch_size]

        if only_labels:
            yield batch_labels

        else:
            batch_questions = embedder(data.question[i:i + batch_size])
            batch_sents = embedder(data.sentence[i:i + batch_size])

            if method == 'concat':
                batch_questions = np.vstack(
                    [Q.ravel() for Q in batch_questions])
                batch_sents = np.vstack([S.ravel() for S in batch_sents])
            elif method == 'average':
                batch_questions = np.vstack(
                    [np.array(Q).mean(axis=0) for Q in batch_questions])
                batch_sents = np.vstack(
                    [np.array(S).mean(axis=0) for S in batch_sents])
            else:
                raise NotImplementedError

            yield np.hstack((batch_questions, batch_sents)), batch_labels

        i += batch_size
        if i >= data.shape[0]:
            # reshuffle and reset the index so positional slicing stays valid
            data = data.sample(frac=1.).reset_index(drop=True)
            i = 0
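
# A minimal usage sketch (the toy frame below is illustrative): the columns
# hold pre-tokenized text, so the assert above passes, and each draw yields a
# feature matrix of averaged question/sentence vectors plus labels.
toy = pd.DataFrame({
    "question": [["what", "is", "glove"]] * 4,
    "sentence": [["glove", "is", "an", "embedding"]] * 4,
    "label": [1, 0, 1, 0],
})
gen = glove_generator(toy, batch_size=2, method="average")
X, y = next(gen)  # X: (2, 2 * embedding_dim); y: the two matching labels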
"""##Bag-of-Words"""

# initialize bag-of-words embedder giving total number of tokens
bow = BoWEmbedder(depth=token_vocab.len)
# it assumes indexed tokenized samples
bow(token_vocab(str_lower(tokenizer(['Kaggle is the best place to study machine learning.']))))

# all 10 tokens are in the vocabulary
sum(bow(token_vocab(str_lower(tokenizer(['Kaggle is the best place to study machine learning.']))))[0])

"""##GloVe Embedder"""

# Glove : https://nlp.stanford.edu/projects/glove/
simple_download(url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt", destination="./glove.6B.100d.txt")

embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt', dim=100, pad_zero=True)
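
# Quick check (a sketch; `tokenizer` and `str_lower` follow the earlier cells):
# without `mean` the embedder returns one 100-d vector per token, zero-padded
# across the batch, while mean=True collapses each sample to a single vector.
tokens = tokenizer(str_lower(['Kaggle is the best place to study machine learning.']))
per_token_vectors = embedder(tokens)
sentence_vectors = embedder(tokens, mean=True)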

"""#Model Build"""

# get all train and valid data from iterator
x_train, y_train = train_iterator.get_instances(data_type="train")
x_valid, y_valid = train_iterator.get_instances(data_type="valid")

# Initialize `KerasClassificationModel`, which composes a shallow-and-wide CNN
# (named `cnn_model` here)
cls = KerasClassificationModel(save_path="./cnn_model_v0",
                               load_path="./cnn_model_v0",
                               embedding_size=embedder.dim,
                               n_classes=classes_vocab.len,
                               model_name="cnn_model",
                               text_size=15,  # number of tokens per sample
                               # the remaining hyperparameters are cut off in
                               # the source; the DeepPavlov tutorial typically
                               # passes kernel_sizes_cnn=[3, 5, 7],
                               # filters_cnn=128, optimizer="Adam", and
                               # loss="categorical_crossentropy"
                               )
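
# A hedged sketch of the training loop (not shown in the source): lowercase,
# tokenize and embed each batch, one-hot the labels, and train batch by batch.
# The OneHotter usage and the epoch count are assumptions.
from deeppavlov.models.preprocessors.one_hotter import OneHotter

onehotter = OneHotter(depth=classes_vocab.len, single_vector=True)
for epoch in range(10):
    for x_batch, y_batch in train_iterator.gen_batches(batch_size=64, data_type="train"):
        cls.train_on_batch(embedder(tokenizer(str_lower(x_batch))),
                           onehotter(classes_vocab(y_batch)))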
def prepare_embeddings_for_query(query: str) -> np.ndarray:
    """Return an embedding for a query string."""
    tokenizer = NLTKTokenizer()
    embedder = GloVeEmbedder(load_path="data/glove.6B.100d.txt", pad_zero=True)
    # mean=True collapses per-token vectors into a single query vector
    embed_query: np.ndarray = embedder(tokenizer([query]), mean=True)[0]
    return embed_query
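
# With both helpers in place, a nearest-document lookup is a cosine similarity
# between the query vector and each stored row (a sketch; the ranking helper
# below is not part of the source):
def most_similar_documents(query: str, top_n: int = 5) -> np.ndarray:
    emb_docs = np.loadtxt("data/embeddings.csv", delimiter=",")
    emb_query = prepare_embeddings_for_query(query)
    norms = np.linalg.norm(emb_docs, axis=1) * np.linalg.norm(emb_query)
    scores = emb_docs @ emb_query / np.clip(norms, 1e-12, None)
    # indices of the closest documents, best first
    return np.argsort(scores)[::-1][:top_n]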
# reconstructed head: the SimpleVocabulary constructor is cut off in the
# source, so the import, save/load paths, and the '<PAD>' token are assumptions
from deeppavlov.core.data.simple_vocab import SimpleVocabulary

token_vocab = SimpleVocabulary(save_path='./tok.dict',
                               load_path='./tok.dict',
                               special_tokens=(
                                   '<PAD>',
                                   '<UNK>',
                               ),
                               unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
token_vocab.freqs.most_common()[:10]
tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./tfidf_v0.pkl',
    load_path='./tfidf_v0.pkl',
    mode='train')
tfidf.fit(str_lower(train_iterator.get_instances(data_type='train')[0]))
tfidf.save()
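
# Quick check (illustrative input): the fitted component now exposes
# TfidfVectorizer.transform and maps lowercased strings to sparse TF-IDF rows.
weights = tfidf(str_lower(['How do I study machine learning?']))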
embedder = GloVeEmbedder(
    "./glove.6B.100d.txt",  # the GloVe file downloaded earlier
    dim=100,
    pad_zero=True)
weighted_embedder = TfidfWeightedEmbedder(
    embedder=embedder,  # our GloVe embedder instance
    tokenizer=tokenizer,  # our tokenizer instance
    mean=True,  # to return one vector per sample
    vectorizer=tfidf  # our TF-IDF vectorizer
)
x_train, y_train = train_iterator.get_instances(data_type="train")
x_valid, y_valid = train_iterator.get_instances(data_type="valid")
x_train = weighted_embedder(x_train)
x_valid = weighted_embedder(x_valid)
cls = SklearnComponent(model_class="sklearn.linear_model:LogisticRegression",
                       infer_method="predict",
                       save_path='./logreg_v0.pkl',
                       load_path='./logreg_v0.pkl',
                       mode='train')
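
# Fit on the TF-IDF-weighted GloVe features, persist the model, and predict
# on the validation split (a sketch following the SklearnComponent API above):
cls.fit(x_train, y_train)
cls.save()
y_valid_pred = cls(x_valid)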