Example #1
def load_from_file(chakin_index, nb_dims, pre_fix, sub_folder, root_folder, save_dir):
    zip_file = os.path.join(root_folder, "{}.zip".format(sub_folder))
    zip_file_alt = pre_fix + zip_file[5:]
    unzip_folder = os.path.join(root_folder, sub_folder)
    if sub_folder[-1] == "d":
        glove_fname = os.path.join(unzip_folder, "{}.txt".format(sub_folder))
    else:
        glove_fname = os.path.join(unzip_folder, "{}.{}d.txt".format(sub_folder, nb_dims))
    
    if not os.path.exists(zip_file) and not os.path.exists(unzip_folder):
        print("Downloading embeddings to '{}'".format(zip_file))
        chakin.download(number=chakin_index, save_dir=save_dir)
    else:
        print("Embeddings has already been downloaded")
    
    if not os.path.exists(unzip_folder):
        import zipfile
        if not os.path.exists(zip_file) and os.path.exists(zip_file_alt):
            zip_file = zip_file_alt
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            print("Extracting embeddings to '{}'".format(unzip_folder))
            zip_ref.extractall(unzip_folder)
    else:
        print("Embeddings has already been extracted.")
    
    # Load indices and vectors from disk
    word_to_embedding_dict = dict()
    index_to_embedding = []
    num_rep = 0
    j = 0
    with open(glove_fname, "r") as glove_file:
        for i, line in enumerate(glove_file):
            split = line.split(" ")
            word = split[0]
            representation = split[1:]

            representation = np.array(
                [float(val) for val in representation]
            )
            word_to_embedding_dict[word] = i
            index_to_embedding.append(representation)

            if num_rep == 0:
                num_rep = len(representation)
            
            j = i
    
    _word_not_found = np.array([0.0] * num_rep)  # Zero vector used for out-of-vocabulary words

    j += 1
    word_to_embedding_dict["UNKNOWN"] = j
    index_to_embedding = np.array(_word_not_found + [_word_not_found])
    return word_to_embedding_dict, index_to_embedding
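A minimal call sketch for the helper above; the chakin index and folder names are assumptions for illustration, not values taken from the snippet (run chakin.search(lang="English") to find the right index).

import os
import numpy as np
import chakin

CHAKIN_INDEX = 11            # assumed to map to a GloVe.6B entry; verify with chakin.search
NB_DIMS = 50
SUB_FOLDER = "glove.6B"      # the zip is expected at <root_folder>/<sub_folder>.zip
ROOT_FOLDER = "embeddings"
os.makedirs(ROOT_FOLDER, exist_ok=True)

word_to_index, index_to_embedding = load_from_file(
    CHAKIN_INDEX, NB_DIMS, "glove", SUB_FOLDER, ROOT_FOLDER, ROOT_FOLDER)
print(index_to_embedding.shape)  # (number of words + 1, NB_DIMS)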
Example #2
    def _prepare_embeddings(self,
                            embedding_name: str,
                            embed_dim: int,
                            embed_zip_folder: str = None,
                            **read_csv_kwargs):
        tablename = self._tablename()

        with TemporaryDirectory() as d:
            if not self.embedding_exists():
                if not embed_zip_folder:
                    logging.info(
                        f"Can't find {embedding_name} locally. Started download."
                    )
                    download_filename = chakin.download(name=embedding_name,
                                                        save_dir=d)
                else:
                    d = embed_zip_folder  # Use the downloaded folder

                reader = self._parse_embeddings(d, embed_dim,
                                                **read_csv_kwargs)
                for chunk_df in reader:
                    # perform any transformations to these rows in memory
                    df = chunk_df.word_vec.str.split(" ", n=1,
                                                     expand=True).rename(
                                                         {
                                                             0: "word",
                                                             1: "vector_str"
                                                         },
                                                         axis=1)
                    df.to_sql(tablename, self.db, if_exists="append")

                # create an index for faster lookups
                self.cur.execute(
                    f"CREATE UNIQUE INDEX IF NOT EXISTS {tablename}_vector_str_idx ON {tablename}(vector_str)"
                )
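A lookup sketch against the table this method builds, assuming self.db is a sqlite3 connection and the word / vector_str columns created above are kept as-is; the function name is hypothetical.

import numpy as np

def lookup_vector(conn, tablename, word):
    # Fetch the space-separated vector string stored for the word and parse it.
    row = conn.execute(
        f"SELECT vector_str FROM {tablename} WHERE word = ?", (word,)
    ).fetchone()
    if row is None:
        return None
    return np.array([float(x) for x in row[0].split()], dtype=np.float32)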
Example #3
def embedding_weights_load(words_map, embedding_weights_path):
    pre_trained_embedding = None
    try:
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except Exception:
        print("fastText binary file (.bin) was not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model = KeyedVectors.load_word2vec_format(
                './Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size, word_dimension), dtype=np.float32)

    for k, v in words_map.items():
        word = k
        word_number = v

        try:
            w[word_number][:] = model[word]
        except KeyError as e:
            if pre_trained_embedding == "bin":
                w[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                w[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return w
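A minimal call sketch; words_map is assumed to map each word to a row index starting at 0, and the .bin path is illustrative (if it does not exist, the function falls back to downloading wiki.en.vec via chakin).

words_map = {"the": 0, "cat": 1, "sat": 2}  # hypothetical vocabulary: word -> row index
w = embedding_weights_load(words_map, "./Word_embedding/wiki.en.bin")
print(w.shape)  # (len(words_map), embedding dimension)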
Example #4
def download_glove():
    global ZIP_FILE
    if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
        # GloVe by Stanford is licensed Apache 2.0:
        #     https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
        #     http://nlp.stanford.edu/data/glove.twitter.27B.zip
        #     Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
        print("Downloading embeddings to '{}'".format(ZIP_FILE))
        chakin.download(number=CHAKIN_INDEX,
                        save_dir='./{}'.format(DATA_FOLDER))
    else:
        print("Embeddings already downloaded.")
    if not os.path.exists(UNZIP_FOLDER):
        import zipfile
        if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
            ZIP_FILE = ZIP_FILE_ALT
        with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
            print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
            zip_ref.extractall(UNZIP_FOLDER)
    else:
        print("Embeddings already extracted.")
Example #5
def embedding_weights_load(words_map, embeddingWeights_path):
    pre_trained_embedding = None
    try:

        model = FastText.load_fasttext_format(
            embeddingWeights_path)  # If a fastText .bin file exists, load it
        pre_trained_embedding = "bin"

    except Exception:
        print("fastText binary file (.bin) was not found!"
              )  # Otherwise fall back to the Wikipedia pre-trained vectors
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print("Loading vectors...")
        model = KeyedVectors.load_word2vec_format(
            './Word_embedding/wiki.en.vec')
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]  # Get the embedding dimensionality
    W = np.zeros((vocab_size, word_dimension),
                 dtype=np.float32)  # Matrix that will hold the embeddings
    for k, v in words_map.items():  # k is the word, v is the word ID
        word = k
        word_number = v
        # If a word does not exist in the model, its embedding is initialized randomly
        try:
            W[word_number][:] = model[word]
        except KeyError as e:
            if pre_trained_embedding == "bin":
                W[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                W[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return W
Example #6
File: storage.py  Project: upura/chariot
    def chakin(self, lang="", number=-1, name=""):
        import chakin
        if lang:
            chakin.search(lang)
        elif number > -1 or name:
            path = self.data_path("external")
            if not os.path.exists(path):
                os.mkdir(path)

            table = chakin.downloader.load_datasets()

            index = number
            if number < 0:
                index = table.index[table["Name"] == name].tolist()
                index = index[0]

            _name = table.iloc[index]["Name"].lower()

            for ext in [".txt", ".vec"]:
                check_path = os.path.join(path, _name) + ext
                if os.path.exists(check_path):
                    return check_path

            vec_path = chakin.download(index, path)

            base, ext = os.path.splitext(vec_path)
            _dir = os.path.dirname(vec_path)
            if ext == ".vec":
                vec_path = os.rename(vec_path, os.path.join(_dir, _name + ext))
            elif ext in [".zip", ".gz"]:
                _path = self.expand(vec_path, ext)
                os.remove(vec_path)
                vec_path = _path

            return vec_path

        else:
            raise Exception("You have to specify lang to search or "
                            "number/name to download")
Example #7
def downloadGlove(gloveFolderPath, gloveDim=100):  # <- downloads every dimension, so the index doesn't really matter
    print("Download Glove")
    gloveTwitterIDX = {  '25' : 17, "50" : 18, "100" : 19, "200" : 20 } #key = dim, value = index
    chakinIDX = gloveTwitterIDX[str(gloveDim)]
    zipFile = chakin.download(number=chakinIDX, save_dir='/tmp/glove')
    print("Unzip Glove")
    ##unzip
    unzipedPath = "/tmp/glove_unzipped"
    with zipfile.ZipFile(zipFile, 'r') as zip_ref:
        zip_ref.extractall(unzipedPath)
        
    ##Move
    dest = Path(gloveFolderPath)
    destAbsolute = dest.resolve()
    dest.mkdir(parents=True, exist_ok=True)

    for f in [join(unzipedPath, f) for f in listdir(unzipedPath)]:
        shutil.move(f, destAbsolute)
        
    ## Try to delete the temporary download/unzip folders
    shutil.rmtree("/tmp/glove", ignore_errors=True)
    shutil.rmtree(unzipedPath, ignore_errors=True)
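The imports this helper assumes, plus a sample call (the destination path is illustrative):

import shutil
import zipfile
from os import listdir
from os.path import join
from pathlib import Path
import chakin

downloadGlove("./embeddings/glove_twitter", gloveDim=100)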
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER,
                                  "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(
        UNZIP_FOLDER, "{}.{}d.txt".format(SUBFOLDER_NAME,
                                          NUMBER_OF_DIMENSIONS))

if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    # GloVe by Stanford is licensed Apache 2.0:
    #     https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    #     http://nlp.stanford.edu/data/glove.twitter.27B.zip
    #     Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

print('\nRun complete')
Example #9
import chakin
chakin.search(lang='English')
chakin.download(21, "./")

if __name__ == '__main__':
    pass
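The number 21 refers to a row in the table printed by chakin.search; to select an embedding by name instead, the dataset table can be queried directly, as Example #6 above does. A sketch (the embedding name is an assumption):

import chakin

table = chakin.downloader.load_datasets()
index = table.index[table["Name"] == "GloVe.840B.300d"].tolist()[0]
chakin.download(number=index, save_dir="./")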
Example #10
from typing import List
import os
from collections import defaultdict
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import chakin

# In[20]:

print("Searching for avaiable package.")
chakin.search(lang="English")
DOWNLOAD = input("Download embedding? >>> ").upper() == "Y"
if DOWNLOAD:
    emb_idx = int(input("Index of embedding to download >>> "))
    save_dir = input("Directory to save embedding >>> ") or "../data/"
    chakin.download(number=emb_idx, save_dir=save_dir)

# In[21]:

from data_import import load_embedding_from_disks

# In[22]:

# Parameter
# GLOVE_FILENAME = "../data/glove.840B.300d.txt"
GLOVE_FILENAME = "../data/glove.6B.50d.txt"

# In[23]:

df = pd.read_csv("./text_emotion.csv")
df.head()
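Assuming load_embedding_from_disks returns a word-to-index dict and an index-to-vector array (the same convention as load_from_file in Example #1), a sketch of wiring the vectors into a frozen Keras Embedding layer:

from keras.layers import Embedding

word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME)

embedding_layer = Embedding(
    input_dim=index_to_embedding.shape[0],   # vocabulary size (incl. the unknown-word row)
    output_dim=index_to_embedding.shape[1],  # 50 for glove.6B.50d
    weights=[index_to_embedding],
    trainable=False,                         # keep the pre-trained vectors fixed
)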
Example #11
ZIP_FILE = os.path.join(pre_trained_dir, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(pre_trained_dir, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.{}d.txt".format(SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))

if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    # GloVe by Stanford is licensed Apache 2.0:
    #     https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    #     http://nlp.stanford.edu/data/glove.twitter.27B.zip
    #     Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(pre_trained_dir))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile

    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")
end = datetime.datetime.now()
print('ELMo embeddings from GloVe generated in {} minutes'.format((end - start).seconds / 60))