# Пример #1 (Example #1)
# 0
import codecs
import _pickle as pickle
from q3_sgd import load_saved_params
path = "utils/datasets/stanfordSentimentTreebank"
from utils.treebank import StanfordSentiment

# Load the Stanford Sentiment Treebank and inspect the first sentence.
dataset = StanfordSentiment()
sentences = dataset.sentences()

# The corpus tokens carry latin-1 mojibake; decode each token of the
# first sentence before displaying it.
sentence = [codecs.decode(tok, 'latin1') for tok in sentences[0]]
# Notebook-style bare expression: show the sentence as one string.
" ".join(sentence)

# Build phrase -> id mapping from dictionary.txt, where each non-empty
# line has the form "phrase|id". Keys are lowercased; `phrases` counts
# the lines actually consumed.
dictionary = {}
phrases = 0
with open(path + "/dictionary.txt", "r") as f:
    for raw_line in f:
        entry = raw_line.strip()
        if entry:  # skip blank lines
            fields = entry.split("|")
            dictionary[fields[0].lower()] = int(fields[1])
            phrases += 1

# sentences = []

# with open(path + "/datasetSentences.txt", "r", encoding='utf-8') as f:
#     for line in f:
#
#         splitted = line.strip().split()[1:]
#         # print(splitted)
#         # Deal with some peculiar encoding issues with this file
#
# Пример #2 (Example #2)
# 0
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
# Load the corpus and its vocabulary; tokens maps word -> index.
dataset = StanfordSentiment()
dataset.sentences()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
# NOTE(review): both the stdlib and NumPy RNGs are re-seeded here, right
# before vector initialization, so reproducibility depends on this exact
# statement order — do not reorder.
random.seed(31415)
np.random.seed(9265)

# Wall-clock start for timing the training run below.
startTime = time.time()
wordVectors = np.concatenate(