# Exemplo n.º 1
# 0
    def chakin(self, lang="", number=-1, name=""):
        """Search or download pretrained word vectors via the chakin package.

        Args:
            lang: if given, list embeddings available for that language
                (search mode; nothing is downloaded).
            number: catalogue index of the embedding to download (>= 0).
            name: alternatively, the catalogue "Name" of the embedding.

        Returns:
            Path to the downloaded (renamed or expanded) vector file when
            downloading; None when only searching.

        Raises:
            ValueError: when ``name`` matches no catalogue entry.
            Exception: when neither ``lang`` nor ``number``/``name`` is given.
        """
        import chakin
        if lang:
            # Search mode: chakin prints the matching catalogue entries.
            chakin.search(lang)
        elif number > -1 or name:
            path = self.data_path("external")
            if not os.path.exists(path):
                os.mkdir(path)

            table = chakin.downloader.load_datasets()

            # Resolve a name to its catalogue index when no index was given.
            index = number
            if number < 0:
                matches = table.index[table["Name"] == name].tolist()
                if not matches:
                    raise ValueError(
                        "No embedding named {} in the chakin "
                        "catalogue".format(name))
                index = matches[0]

            _name = table.iloc[index]["Name"].lower()

            # Reuse an already-downloaded file if one exists.
            for ext in [".txt", ".vec"]:
                check_path = os.path.join(path, _name) + ext
                if os.path.exists(check_path):
                    return check_path

            vec_path = chakin.download(index, path)

            ext = os.path.splitext(vec_path)[1]
            _dir = os.path.dirname(vec_path)
            if ext == ".vec":
                # BUG FIX: os.rename() returns None, so the original
                # `vec_path = os.rename(...)` clobbered the path and the
                # method returned None for .vec downloads. Rename first,
                # then keep the new path.
                renamed = os.path.join(_dir, _name + ext)
                os.rename(vec_path, renamed)
                vec_path = renamed
            elif ext in [".zip", ".gz"]:
                # Expand the archive and drop the compressed original.
                _path = self.expand(vec_path, ext)
                os.remove(vec_path)
                vec_path = _path

            return vec_path

        else:
            raise Exception("You have to specify lang to search or "
                            "number/name to download")
# text files for use in language models.

# Note that the downloading process can take about 10 minutes to complete.

import numpy as np
import tensorflow as tf

# Python chakin package previously installed by
#    pip install chakin
import chakin

import json
import os
from collections import defaultdict

# List the embeddings chakin knows about for English so the catalogue
# index chosen below can be cross-checked against its output.
chakin.search(lang='English')  # lists available indices in English

# Specify English embeddings file to download and install
# by index number, number of dimensions, and subfolder name.
# Note that GloVe 50-, 100-, 200-, and 300-dimensional vectors
# are downloaded with a single zip download.
CHAKIN_INDEX = 11            # chakin catalogue index for GloVe.6B
NUMBER_OF_DIMENSIONS = 50    # embedding vector length to use
SUBFOLDER_NAME = "gloVe.6B"  # folder the archive is unzipped into

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER,
encoded_favs

# In[310]:

# Length of the longest encoded-favourites sequence; used as the common
# padded length below. (Assumes encoded_favs is a list of sequences,
# built in an earlier notebook cell — not visible here.)
maxlen = max(len(x) for x in encoded_favs)
maxlen

# In[311]:

# pad the vectors to create uniform length
# ('post' appends the padding zeros after each sequence's tokens)
padded_favs = pad_sequences(encoded_favs, maxlen=maxlen, padding='post')
padded_favs

# In[312]:

# Re-list the available English embeddings before picking the Twitter set.
chakin.search(lang='English')

# In[313]:

# Downloading Twitter.25d embeddings from Stanford:

CHAKIN_INDEX = 17                     # chakin catalogue index for glove.twitter.27B
NUMBER_OF_DIMENSIONS = 25             # embedding vector length to use
SUBFOLDER_NAME = "glove.twitter.27B"  # folder the archive is unzipped into

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER,
# Exemplo n.º 4
# 0
import pandas as pd
import tensorflow as tf
import datetime
from sklearn.model_selection import train_test_split
import _pickle as pickle
from typing import List
import os
from collections import defaultdict
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
import chakin

# In[20]:

# Interactively list the English embeddings chakin offers and optionally
# download one into a directory chosen by the user.
print("Searching for available package.")
chakin.search(lang="English")
DOWNLOAD = bool(input("Download embedding? >>> ").upper() == "Y")
if DOWNLOAD:
    emb_idx = int(input("Index of embedding to download >>> "))
    # BUG FIX: the directory entered here was previously ignored — the
    # download was hard-coded to save_dir="../data/". Keep "../data/" as
    # the default when the user enters nothing.
    save_dir = input("Directory to save embedding ") or "../data/"
    chakin.download(number=emb_idx, save_dir=save_dir)

# In[21]:

from data_import import load_embedding_from_disks

# In[22]:

# Parameter: path of the GloVe vector text file produced by the download
# step above (larger 300d Common Crawl file kept as an alternative).
# GLOVE_FILENAME = "../data/glove.840B.300d.txt"
GLOVE_FILENAME = "../data/glove.6B.50d.txt"