def __init__(self, embedding_file, vocab_file, oov=-1,
             hidden1=50, hidden2=50, hidden3=50, win=7, alpha=0.025):
    self.hidden1, self.hidden2, self.hidden3 = hidden1, hidden2, hidden3
    self.win = win
    self.alpha = alpha
    self.oov = oov
    self.embedding_matrix = self.load_embeddings(embedding_file)
    self.dimension = self.embedding_matrix.shape[-1]
    log.debug("Embedding matrix n rows: {}".format(self.embedding_matrix.shape[0]))
    log.debug("Embedding dimension: {}".format(self.dimension))
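# `load_embeddings` is called above but is not part of this excerpt. A minimal,
# hypothetical sketch (not the repo's actual loader), assuming a plain-text
# file with one whitespace-separated vector per line, rows aligned with
# `vocab_file`:
def load_embeddings(self, embedding_file):
    import numpy as np  # presumably imported at module level in the repo
    # Each row becomes one embedding; result has shape (n_words, dimension).
    return np.loadtxt(embedding_file)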
def create(self, reader):
    w_cn_lst = sort_vocab(update_counts(reader, downcase=self.downcase, sep=self.sep))
    log.debug("Vocabulary size after sorting: {}.".format(len(w_cn_lst)))
    w_cn_lst = w_cn_lst[self.discard_n_top_freq:self.max_n_words]
    if self.min_freq > 1:
        w_cn_lst = prune_freq(w_cn_lst, self.min_freq)
    self.corpus_size = sum(i[1] for i in w_cn_lst)
    # create index
    self.w_index["<s>"] = 0  # for padding
    self.inv_w_index[0] = "<s>"
    for idx, (w, _) in enumerate(w_cn_lst, 1):
        self.w_index[w] = idx
        self.inv_w_index[idx] = w
    self.w_cn = dict(w_cn_lst)
    log.debug("Vocabulary size: {}.".format(len(self.w_index)))
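# `prune_freq` is a module-level helper defined elsewhere in the repo. A
# minimal sketch of the behavior its name and call site imply (an assumption,
# not the repo's actual code):
def prune_freq(w_cn_lst, min_freq):
    # Keep only (word, count) pairs occurring at least min_freq times.
    return [(w, cn) for w, cn in w_cn_lst if cn >= min_freq]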
def Sequence_Level(self, train_file, test_file, num_label, epochs):
    log.debug("Declaring theano vars.")
    random.seed(5)
    # Weights initialised uniformly in [-0.1, 0.1).
    W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
    W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
    W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
    U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)
    x = T.dmatrix("x")  # len(l) by win*dimension
    y = T.lvector("y")
    learn_rate = T.scalar("learn_rate")
    # Three ReLU hidden layers followed by a softmax output layer.
    A1 = T.dot(x, W1)
    B1 = A1 * (A1 > 0)
    A2 = T.dot(B1, W2)
    B2 = A2 * (A2 > 0)
    A3 = T.dot(B2, W3)
    B3 = A3 * (A3 > 0)
    G = T.dot(B3, U)
    L1 = T.nnet.softmax(G)  # len(l) by num_label
    # Alternative with tanh activations:
    # L1 = T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x, W1)), W2)), W3)), U))
    cost = T.nnet.categorical_crossentropy(L1, y).mean()
    gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
    # gw_x = T.grad(cost, [x])
    log.info("Compiling theano model.")
    # f1 performs one plain SGD step per call.
    f1 = theano.function(inputs=[x, y, learn_rate], outputs=[cost],
                         updates=((W1, W1 - learn_rate * gw1),
                                  (W2, W2 - learn_rate * gw2),
                                  (W3, W3 - learn_rate * gw3),
                                  (U, U - learn_rate * gu)))
    # f2 = theano.function(inputs=[x, y], outputs=cost)
    prediction = T.argmax(L1, axis=1)
    discrepancy = prediction - y  # zero where the prediction is correct
    f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
    # f4 = theano.function(inputs=[x, y], outputs=gw_x)
    alpha = self.alpha

    log.info("Read-in the training and test data.")
    with open(train_file, "r") as open_train:
        train_lines = open_train.readlines()
    with open(test_file, "r") as open_test:
        test_lines = open_test.readlines()

    log.info("Start training.")
    counter = 0
    start = time.time()
    iter_ = epochs
    for j in range(iter_):
        log.info("Epoch: {}...".format(j + 1))
        for i in range(len(train_lines)):
            if i % 1000 == 0:
                log.debug(i)
            counter += 1
            # Linear learning-rate decay over all updates, floored at 0.01.
            current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
            if current_alpha < 0.01:
                current_alpha = 0.01
            # Each line holds space-separated token indices and labels,
            # separated by "|".
            parts = train_lines[i].split("|")
            token_list = list(fromstring(parts[0], dtype=int, sep=' '))
            x_ = self.contextwin(token_list)  # len(l) by win*dimension
            y_ = fromstring(parts[1], dtype=int, sep=' ')
            f1(x_, y_, current_alpha)
        # Evaluate on the test set after each epoch.
        total_num = 0
        total_value = 0
        goldlabels = []
        predictions = []
        for i in range(len(test_lines)):
            parts = test_lines[i].split("|")
            token_line = parts[0].strip()
            label_line = parts[1].strip()
            y_gold = fromstring(label_line, dtype=int, sep=' ')
            x_test = self.contextwin(list(fromstring(token_line, dtype=int, sep=' ')))
            total_num += x_test.shape[0]
            discrep, preds = f3(x_test, y_gold)
            goldlabels.extend(list(y_gold))
            predictions.extend(list(preds))
            # Non-zero discrepancies mark misclassified tokens.
            total_value += x_test.shape[0] - count_nonzero(discrep)
        assert len(goldlabels) == len(predictions)
        log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
        acc = 1.00 * total_value / total_num
        log.info("acc " + str(acc))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / iter_))
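# `self.contextwin` is used above but not shown in this excerpt. A
# hypothetical sketch of the usual context-window construction, assuming row 0
# of `embedding_matrix` holds the "<s>" padding vector (both assumptions, not
# the repo's actual code):
def contextwin(self, token_list):
    import numpy as np
    half = self.win // 2
    # Pad with the "<s>" index so every position has a full window.
    padded = [0] * half + list(token_list) + [0] * half
    rows = []
    for i in range(len(token_list)):
        window = padded[i:i + self.win]
        # Concatenate the win embeddings into one row of width win*dimension.
        rows.append(np.concatenate([self.embedding_matrix[t] for t in window]))
    return np.asarray(rows)  # shape: (len(token_list), win * dimension)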
    # (loop body continued from earlier in the file: collect the second
    #  model's embeddings and the averaged human rating for each word pair)
    e1 = embs2[inst.w1_idx]
    e2 = embs2[inst.w2_idx]
    rel_embs2_1.append(e1)
    rel_embs2_2.append(e2)
    rel_rats2.append(inst.avg_rat)

log.info("Calculating distances and correlation.")
assert len(rel_embs2_1) == len(rel_embs2_2) == len(rel_rats2)
# Cosine distance between the two words' embeddings for each rated pair.
scores2 = []
for e1, e2 in zip(rel_embs2_1, rel_embs2_2):
    scores2.append(cosine(e1, e2))
assert len(scores2) == len(rel_rats2)

corr = spearman(scores, rel_rats)
log.debug("{} embedded words found out of {}.".format(len(scores), len(d)))
log.info("Correlation: {0[0]}, p-value: {0[1]}.".format(corr))
if args.ci:
    # Percentile bootstrap CI for the Spearman correlation.
    ci = bootstrap.ci((scores, rel_rats), statfunction=spearman, method="pi")
    log.info("CI: {0[0]} ({1}), {0[1]} (+{2}).".format(
        ci[:, 0], ci[:, 0][0] - corr[0], ci[:, 0][1] - corr[0]))

if args.input_dir2:
    corr2 = spearman(scores2, rel_rats2)
    log.debug("Model2: {} embedded words found out of {}.".format(len(scores2), len(d)))
    log.info("Model2: Correlation: {0[0]}, p-value: {0[1]}.".format(corr2))
    if args.ci:
        ci2 = bootstrap.ci((scores2, rel_rats2), statfunction=spearman, method="pi")
        log.info("Model2: CI: {0[0]} ({1}), {0[1]} (+{2}).".format(
            ci2[:, 0], ci2[:, 0][0] - corr2[0], ci2[:, 0][1] - corr2[0]))

# corr_between = spearman(scores, scores2)
# log.info("Between-models: Correlation: {0[0]}, p-value: {0[1]}.".format(corr_between))
# sign = dependent_corr(corr[0], corr2[0], corr_between[0], n=len(rel_rats), twotailed=True, conf_level=0.95)
# log.info("Significance: Test score: {0[0]}, p-value: {0[1]}.".format(sign))
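# `dependent_corr` in the commented-out block above is not defined in this
# excerpt. A sketch of Williams' t test (Steiger, 1980) for comparing two
# dependent correlations that share one variable, matching the call signature
# used above (an assumption about the intended helper, not the repo's code):
def dependent_corr(xy, xz, yz, n, twotailed=True, conf_level=0.95):
    from math import sqrt
    from scipy.stats import t as t_dist
    # xy, xz: each model's correlation with the ratings; yz: correlation
    # between the two models' score vectors; n: number of word pairs.
    det = 1 - xy**2 - xz**2 - yz**2 + 2 * xy * xz * yz
    av = (xy + xz) / 2.0
    t_stat = (xy - xz) * sqrt(
        (n - 1) * (1 + yz)
        / (2.0 * (n - 1) / (n - 3) * det + av**2 * (1 - yz)**3))
    p = 1 - t_dist.cdf(abs(t_stat), n - 3)
    if twotailed:
        p *= 2
    return t_stat, p  # conf_level kept for signature parity; unused here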