"""Load the Stanford Sentiment Treebank phrase dictionary.

Reads ``dictionary.txt`` (``<phrase>|<phrase id>`` pairs) into
``dictionary``, mapping each lower-cased phrase to its integer id,
and counts the total number of phrases. Also decodes the first
dataset sentence from latin-1 as an encoding sanity check.
"""
import codecs
import _pickle as pickle

from q3_sgd import load_saved_params
from utils.treebank import StanfordSentiment

path = "utils/datasets/stanfordSentimentTreebank"

dataset = StanfordSentiment()
sentences = dataset.sentences()

# The raw tokens are latin-1 encoded; decode the first sentence to
# verify the encoding handling works.
sentence = [codecs.decode(word, 'latin1') for word in sentences[0]]
# NOTE(review): the joined string is discarded — presumably a leftover
# notebook/REPL inspection cell; kept for behavior parity.
" ".join(sentence)

# Map each lower-cased phrase to its integer phrase id.
dictionary = dict()
phrases = 0
with open(path + "/dictionary.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines in the dictionary file.
            continue
        # dictionary.txt format: <phrase>|<phrase id>
        splitted = line.split("|")
        dictionary[splitted[0].lower()] = int(splitted[1])
        phrases += 1
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')  # non-interactive backend: plots render to files, no display needed
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
dataset.sentences()
# tokens maps each word to its index; nWords is the vocabulary size.
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
# NOTE(review): expression is continued beyond this chunk — presumably
# concatenating input and output vector matrices; confirm downstream.
wordVectors = np.concatenate(