Example #1
0
def create_dataset(tweets, window, datafile="mapped_tweets.npy", export=True):
    """Build a (contexts, neighbors) training set from mapped tweets.

    Parameters
    ----------
    tweets : dict or None
        Mapping of tweets to feed ``Word2Vec.create_dataset``. When None,
        it is restored from ``datafile`` (a dict saved with ``np.save``).
    window : int
        Context window size forwarded to ``Word2Vec.create_dataset``.
    datafile : str
        Path of the saved tweet mapping, used only when ``tweets`` is None.
    export : bool
        When True, dump the arrays to ``./data/npcontexts.dat`` and
        ``./data/npneighbors.dat``.
    """
    if tweets is None:
        try:
            # allow_pickle=True is required to restore a pickled dict:
            # NumPy >= 1.16.3 refuses object arrays by default, which would
            # raise ValueError here and bypass the handler below.
            tweets = np.load(datafile, allow_pickle=True).item()
        except FileNotFoundError:
            print("cannot find " + datafile)
            # Same exit code as before; avoids the site-module exit() helper,
            # which is not guaranteed outside interactive sessions.
            raise SystemExit(1)
    contexts, neighbors = Word2Vec.create_dataset(tweets, window)
    if export:
        print("saving train set to file")
        contexts = np.array(contexts)
        neighbors = np.array(neighbors)
        contexts.tofile('./data/npcontexts.dat')
        neighbors.tofile('./data/npneighbors.dat')
Example #2
0
def create_trainset(window, export=True):
    """Create a (contexts, neighbors) training set from mapped_comments.json.

    Reads the mapped comments, flattens them into a single list of
    sentences, drops empty ones, and builds the skip-gram pairs via
    ``Word2Vec.create_dataset``.

    Parameters
    ----------
    window : int
        Context window size forwarded to ``Word2Vec.create_dataset``.
    export : bool
        When True, dump the arrays to ``npcontexts.dat`` and
        ``npneighbors.dat`` in the current directory.
    """
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may differ.
    with open("mapped_comments.json", encoding="utf-8") as f:
        comments = json.load(f)

    sentences = []
    total = len(comments)
    # enumerate replaces the zip(comments, range(len(comments))) anti-idiom.
    for index, key in enumerate(comments):
        progress(index, total, "combining sentences")
        sentences.extend(comments[key])

    # Drop empty sentences (comprehension instead of filter(lambda x: x, ...)).
    sentences = [s for s in sentences if s]
    print("finished")
    sentences = np.array(sentences)
    contexts, neighbors = Word2Vec.create_dataset(sentences, window)
    if export:
        npc = np.array(contexts)
        npn = np.array(neighbors)
        npc.tofile('npcontexts.dat')
        npn.tofile('npneighbors.dat')