Example #1
def main():
    verify_cwd()
    if not os.path.exists("./data/"):
        print("creating folder...")
        os.mkdir("./data/")
    if not os.path.exists("./data/trees/"):
        print("preparing sentiment treebank...")
        try:
            pytreebank.load_sst("./data/")
        except Exception:
            # The pytreebank downloader is not robust on Windows; we only
            # need the data and the parser, so download errors are ignored.
            pass

    if not os.path.exists("./data/text8.zip"):
        print("retrieving text8...")
        urllib.request.urlretrieve("http://mattmahoney.net/dc/text8.zip",
                                   "./data/text8.zip")
    if not os.path.exists("./data/text8"):
        print("extracting text8...")
        with zipfile.ZipFile("./data/text8.zip", "r") as zip_ref:
            zip_ref.extractall("./data/")
    if not os.path.exists("./data/word2vec.model"):
        print("training word2vec...")
        train_word2vec()
    # GloVe training is slow, so it is not run here; invoke train_glove()
    # manually via create_pretrain_model.py when needed.
    # if not os.path.exists("./data/glove.model"):
    #     print("training glove...")
    #     train_glove()
    print("=== ALL CLEAR! ===")
Example #2
def main():
    verify_cwd()
    try:
        pytreebank.load_sst("./data/")
    except Exception:
        # The pytreebank downloader is not robust on Windows; we only
        # need the data and the parser, so download errors are ignored.
        pass
    train_data = pytreebank.import_tree_corpus("./data/trees/train.txt")
    assert str(train_data[0]) == TARGET_STRING, "pytreebank correctness check failed."
    print("Correctness verified.")
Example #3
def train_word2vec(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # gensim.models.Word2Vec; we may want to retrain this later.
    if sentences is None:
        sentences = word2vec.Text8Corpus("./data/text8")
    if save_name is None:
        save_name = "./data/word2vec.model"
    if nr_feature is None:
        nr_feature = 200
    # gensim < 4.0 keyword; in gensim >= 4.0 this parameter is vector_size.
    model = word2vec.Word2Vec(sentences, size=nr_feature)
    model.save(save_name)
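Once saved, the model can be reloaded and queried; a minimal sketch using gensim's standard API ("king" is only an illustrative query word):

from gensim.models import word2vec

model = word2vec.Word2Vec.load("./data/word2vec.model")
# Nearest neighbours in embedding space; .wv works on gensim 3.x and 4.x.
print(model.wv.most_similar("king", topn=5))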
Example #4
def train_glove(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(
            itertools.islice(word2vec.Text8Corpus('./data/text8'), None))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200

    corpus = glove.Corpus()
    print("start fiting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix,
           epochs=10,
           no_threads=multiprocessing.cpu_count(),
           verbose=True)
    # Attach the word-to-id mapping so the model can be queried by word.
    gl.add_dictionary(corpus.dictionary)
    corpus.save("./data/corpus.model")
    gl.save(save_name)
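With the dictionary attached (gl.add_dictionary above), the saved model can be queried by word; a minimal sketch against the glove-python API ("king" is only an illustrative query word):

import glove

gl = glove.Glove.load("./data/glove.model")
# Nearest neighbours by similarity of the learned word vectors.
print(gl.most_similar("king", number=5))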
Example #5
from definitions import verify_cwd
from analyzer.common.dataset import Dataset
from tqdm import tqdm
import pickle
import numpy as np
np.random.seed(42)

if __name__ == "__main__":
    verify_cwd()
    # Collect the lowercased vocabulary across the train/val/test splits.
    words = set()
    for get_split in [
            Dataset.get_raw_train_dataset, Dataset.get_raw_val_dataset,
            Dataset.get_raw_test_dataset
    ]:
        for tree in tqdm(get_split()):
            # to_lines()[0] is the full root sentence of the parse tree.
            text = tree.to_lines()[0].split()
            for word in text:
                words.add(word.lower())
    with open("./data/words.pkl", "wb") as f:
        pickle.dump(words, f)
    with open("./data/glove_300.pkl", "rb") as f:
        glove_dict = pickle.load(f)
    print(len(words))
    # Back-fill vocabulary words missing from GloVe with small random
    # vectors in [-0.05, 0.05); a zero vector is a possible alternative.
    missing = set()
    for word in words:
        if word not in glove_dict:
            missing.add(word)
            glove_dict[word] = np.random.rand(300) * 0.1 - 0.05
            # glove_dict[word] = np.zeros((300,))
    with open("./data/glove_300_aug.pkl", "wb") as f:
        pickle.dump(glove_dict, f)
    print(len(missing))
    with open("./data/missing.pkl", "wb") as f:
        pickle.dump(missing, f)
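A short round-trip check for the augmented table; it assumes "movie" occurs in the SST vocabulary (an assumption, chosen only for illustration):

with open("./data/glove_300_aug.pkl", "rb") as f:
    table = pickle.load(f)
print(len(table))            # vocabulary size after augmentation
print(table["movie"].shape)  # (300,) -- assumes "movie" is in the corpus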