import json
import sys

import numpy as np

import Word2Vec  # project-local module providing create_dataset (import path assumed)


def create_dataset(tweets, window, datafile="mapped_tweets.npy", export=True):
    # Fall back to the mapped tweets stored on disk when none are passed in.
    if tweets is None:
        try:
            # allow_pickle is required to restore a dict saved with np.save
            tweets = np.load(datafile, allow_pickle=True).item()
        except FileNotFoundError:
            print("cannot find " + datafile)
            sys.exit(1)

    # Build (context, neighbor) training pairs over the given window size.
    contexts, neighbors = Word2Vec.create_dataset(tweets, window)

    if export:
        print("saving train set to file")
        contexts = np.array(contexts)
        neighbors = np.array(neighbors)
        contexts.tofile('./data/npcontexts.dat')
        neighbors.tofile('./data/npneighbors.dat')
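# Illustrative sketch only: Word2Vec.create_dataset is project code not shown
# here. Assuming it produces skip-gram training pairs, its behaviour would be
# roughly equivalent to the following (the name _skipgram_pairs_sketch is
# hypothetical):
def _skipgram_pairs_sketch(sentences, window):
    contexts, neighbors = [], []
    for sentence in sentences:
        for i, center in enumerate(sentence):
            # pair the center word with every word within `window` positions
            lo, hi = max(0, i - window), min(len(sentence), i + window + 1)
            for j in range(lo, hi):
                if j != i:
                    contexts.append(center)
                    neighbors.append(sentence[j])
    return contexts, neighbors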
def create_trainset(window, export=True):
    # Load the word-index-mapped comments, keyed by comment id.
    with open("mapped_comments.json") as f:
        comments = json.load(f)

    # Flatten every comment's sentences into one list, reporting progress.
    # progress() is a project-local console helper assumed to be in scope.
    sentences = []
    for index, key in enumerate(comments):
        progress(index, len(comments), "combining sentences")
        sentences.extend(comments[key])

    # Drop empty sentences before building the training pairs.
    sentences = list(filter(None, sentences))
    print("finished")

    # dtype=object keeps ragged (variable-length) sentences intact;
    # modern NumPy rejects ragged nested sequences without it.
    sentences = np.array(sentences, dtype=object)
    contexts, neighbors = Word2Vec.create_dataset(sentences, window)

    if export:
        npc = np.array(contexts)
        npn = np.array(neighbors)
        npc.tofile('npcontexts.dat')
        npn.tofile('npneighbors.dat')
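# Illustrative sketch only: `progress` is a project-local console progress
# indicator that create_trainset assumes is in scope. A minimal stand-in
# (the name _progress_sketch is hypothetical) could look like:
def _progress_sketch(current, total, message):
    # rewrite the same console line with the status message and percentage
    percent = 100 * (current + 1) // total
    print("\r{}: {:3d}%".format(message, percent), end="", flush=True)

# Example: create_trainset(window=2) reads mapped_comments.json and writes
# npcontexts.dat / npneighbors.dat into the current working directory.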