def load_wv(vocabfile, wvfile):
    # Load the word-vector matrix (one row per word) and build index <-> word maps
    wv = loadtxt(wvfile, dtype=float)
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return wv, word_to_num, num_to_word
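# For reference, a minimal sketch of the invert_dict helper that load_wv relies on
# (it lives in data_utils.utils), assuming it simply swaps keys and values; the
# values of num_to_word are unique words, so no collisions occur.
def invert_dict(d):
    """Return a new dict mapping each value of d back to its key."""
    return {v: k for k, v in d.items()}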
#wv_dummy = random.randn(10,50)
#model = RNNLM(L0 = wv_dummy, U0 = wv_dummy,
#              alpha=0.005, rseed=10, bptt=4)
#model.grad_check(array([1,2,3]), array([2,3,4]))

from data_utils import utils as du
import pandas as pd

# Load the vocabulary
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                      index_col=0, names=['count', 'freq'], )

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                           and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (
    vocabsize, len(vocab), 100 * (1 - fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)
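# A hedged sketch of what du.seqs_to_lmXY is assumed to do: for language modeling,
# each index sequence becomes an input (all tokens but the last) and a target (the
# same sequence shifted left by one), so the model predicts the next word at every
# position. Illustrative only, not the starter-code implementation.
def seqs_to_lmXY_sketch(seqs):
    X = [s[:-1] for s in seqs]   # inputs: all tokens except the last
    Y = [s[1:] for s in seqs]    # targets: the sequence shifted by one
    return X, Y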
import data_utils.utils as du
import data_utils.ner as ner

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)
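# A rough sketch of the windowing step, assuming du.docs_to_windows pads each
# sentence with boundary tokens and emits, for every word, the indices of its
# surrounding context window plus the tag of the center word. The names below
# (make_windows_sketch, pad_id) are illustrative, not from the starter code.
def make_windows_sketch(sentence_ids, tag_ids, wsize=3, pad_id=0):
    half = wsize // 2
    padded = [pad_id] * half + sentence_ids + [pad_id] * half
    X, y = [], []
    for i, tag in enumerate(tag_ids):
        X.append(padded[i:i + wsize])  # context window centered on word i
        y.append(tag)                  # label is the tag of the center word
    return X, y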
if __name__ == "__main__":
    # Load the vocabulary
    vocab = pd.read_table(
        "data/lm/vocab.ptb.txt", header=None, sep="\s+",
        index_col=0, names=['count', 'freq'], )
    vocabsize = 2000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    num_to_word_embedding = load_vocab_embeddings()
    word_to_num = du.invert_dict(num_to_word)

    # Load the training data
    _, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
    in_word_index, out_word_index = convert_to_lm_dataset(S_train)
    assert len(in_word_index) == len(out_word_index)
    num_of_examples = len(in_word_index)

    random.seed(31415)
    np.random.seed(9265)
    in_word_index, out_word_index = shuffle_training_data(
        in_word_index, out_word_index)
    startTime = time.time()

    # Training should happen here
    # Initialize parameters randomly
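# shuffle_training_data is not shown; a minimal sketch, assuming it permutes the
# input and output index lists in unison so each (input, output) pair stays
# aligned after shuffling. The actual helper may differ.
import random

def shuffle_training_data_sketch(in_idx, out_idx):
    order = list(range(len(in_idx)))
    random.shuffle(order)
    return [in_idx[i] for i in order], [out_idx[i] for i in order]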
import data_utils.utils as du
import data_utils.ner as ner
from softmax_example import SoftmaxRegression
from nerwindow import WindowMLP
import itertools
from numpy import *
from multiprocessing import Pool
import random as rdm

# Note: because of `from numpy import *`, the name `random` below refers to
# numpy.random, so this seeds numpy's RNG (the stdlib module is bound to `rdm`).
random.seed(10)

wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3

docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)

nepoch = 5
N = nepoch * len(y_train)
k = 5  # minibatch size
def main():
    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                               'data/ner/wordVectors.txt')
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    # Set window size
    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                          wsize=windowsize)

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                        wsize=windowsize)

    clf = WindowMLP(wv, windowsize=windowsize, dims=[None, 100, 5],
                    reg=0.001, alpha=0.01)
    train_size = X_train.shape[0]

    """
    costs = pickle.load(open("costs.dat", "rb"))
    clf = pickle.load(open("clf.dat", "rb"))
    """
    nepoch = 5
    N = nepoch * len(y_train)
    k = 5  # minibatch size

    costs = clf.train_sgd(X_train, y_train,
                          idxiter=random_mini(k, N, train_size),
                          printevery=10000, costevery=10000)

    pickle.dump(clf, open("clf.dat", "wb"))
    pickle.dump(costs, open("costs.dat", "wb"))

    plot_learning_curve(clf, costs)

    # Predict labels on the dev set
    yp = clf.predict(X_dev)
    # Save predictions to a file, one per line
    ner.save_predictions(yp, "dev.predicted")
    full_report(y_dev, yp, tagnames)        # full report, helpful diagnostics
    eval_performance(y_dev, yp, tagnames)   # performance: optimize this F1

    # L: V x 50
    # W[:,50:100]: 100 x 50
    responses = clf.sparams.L.dot(clf.params.W[:, 50:100].T)  # V x 100
    index = np.argsort(responses, axis=0)[::-1]

    neurons = [1, 3, 4, 6, 8]  # change this to your chosen neurons
    for i in neurons:
        print "Neuron %d" % i
        top_words = [num_to_word[k] for k in index[:10, i]]
        top_scores = [responses[k, i] for k in index[:10, i]]
        print_scores(top_scores, top_words)
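# random_mini is a project-specific helper not shown here; a hedged sketch of one
# way it could be written, assuming train_sgd expects its idxiter argument to yield
# roughly N/k minibatches, each an array of k random example indices.
import numpy as np

def random_mini_sketch(k, N, train_size):
    # yield N // k minibatches of k randomly chosen example indices each
    for _ in range(N // k):
        yield np.random.choice(train_size, k)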
#from rnn_simple import RNN_SIMPLE
#from brnn import BRNN
#from brnn_weighted import BRNN_WEIGHTED
#from rnn_weighted import RNN_WEIGHTED

from data_utils import utils as du
import pandas as pd
from misc import *

N_ASPECTS = 5
SENT_DIM = 3

# Load the vocabulary
vocab = pd.read_table("worddic.txt", header=None, sep="\s+", index_col=0)
n2w = dict(enumerate(vocab.index))
w2n = du.invert_dict(n2w)
vocabsize = len(w2n)
num2word = dict(enumerate(w2n))
word2num = du.invert_dict(num2word)
print "Number of unique words:", len(num2word)

##############
filename_train = 'x_train.txt'  #'reviews_plain.txt'
filename_dev = 'x_dev.txt'
X_train = read_data(filename_train, word2num)
X_dev = read_data(filename_dev, word2num)
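# read_data is defined elsewhere (presumably in misc); a plausible sketch, assuming
# it reads one review per line, tokenizes on whitespace, and maps each token to its
# index, falling back to the UUUNKKK entry for out-of-vocabulary words. Purely
# illustrative, not the project's implementation.
def read_data_sketch(filename, word2num, unk="UUUNKKK"):
    data = []
    with open(filename) as f:
        for line in f:
            tokens = line.strip().split()
            data.append([word2num.get(t, word2num.get(unk, 0)) for t in tokens])
    return data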
# Load the vocabulary
#vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
#                      index_col=0, names=['count', 'freq'], )
vocab2 = pd.read_table("worddic.txt", header=None, sep="\s+", index_col=0)

# Choose how many top words to keep
#vocabsize = 2000
vocabsize2 = 58868  # remove for implementation
#num_to_word = dict(enumerate(vocab.index[:vocabsize]))
num_to_word2 = dict(enumerate(vocab2.index[:vocabsize2]))
#word_to_num = du.invert_dict(num_to_word)
word_to_num2 = du.invert_dict(num_to_word2)
#print word_to_num2

##############
filename = 'reviews_plain.txt'
print "Opening the file..."
X_train = []
f = open(filename, 'r')
count = 0
for line in f.readlines():
    sentence = []
    line = line.strip()
dropped = 0
for i in word_vocab.keys():
    if word_vocab[i] < 5:
        dropped += word_vocab[i]
    else:
        if i not in word_to_num:
            word_to_num[i] = j
            j = j + 1
word_to_num["<s>"] = j
word_to_num["</s>"] = j + 1
mx = j + 1
print "dropped: " + str(float(dropped) / sum(word_vocab.values()))
num_to_word = du.invert_dict(word_to_num)
vocabsize = len(num_to_word) + 2  # One for line ending and one for unknown

################################
# Prepare data for training.
################################
X_train = []
Y_train = []
for i in alltxt:
    txt = ''.join(l for l in i if ord(l) < 128)
    temparr = [word_to_num["<s>"]]  # Start the sentence
    for j in nltk.word_tokenize(txt):
        if j in word_to_num:
            temparr.append(word_to_num[j])
if __name__ == "__main__":
    # Load the vocabulary
    vocab = pd.read_table(
        "data/lm/vocab.ptb.txt", header=None, sep="\s+",
        index_col=0, names=['count', 'freq'], )
    vocabsize = 2000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    num_to_word_embedding = load_vocab_embeddings()
    word_to_num = utils.invert_dict(num_to_word)

    # Load the training data
    _, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
    in_word_index, out_word_index = convert_to_lm_dataset(S_train)
    assert len(in_word_index) == len(out_word_index)
    num_of_examples = len(in_word_index)

    random.seed(31415)
    np.random.seed(9265)
    in_word_index, out_word_index = shuffle_training_data(
        in_word_index, out_word_index)
    startTime = time.time()

    # Training should happen here
    # Initialize parameters randomly
dropped = 0
for i in word_vocab.keys():
    if word_vocab[i] < 5:
        dropped += word_vocab[i]
    else:
        if i not in word_to_num:
            word_to_num[i] = j
            j = j + 1
word_to_num["<s>"] = j
word_to_num["</s>"] = j + 1
mx = j + 1
print "dropped: " + str(float(dropped) / sum(word_vocab.values()))
num_to_word = du.invert_dict(word_to_num)
vocabsize = len(num_to_word) + 2  # One for line ending and one for unknown

################################
# Prepare data for training.
################################
X_train = []
Y_train = []
for i in alltxt:
    txt = ''.join(l for l in i if ord(l) < 128)
    temparr = [word_to_num["<s>"]]  # Start the sentence
    for j in nltk.word_tokenize(txt):
        if j in word_to_num:
            temparr.append(word_to_num[j])