def test(dataset, metadata_path, w2v, rescale=None):
    """Evaluate the Siamese model on the test split and save a scatter
    plot of predicted vs. ground-truth similarities (no regression fit).

    Args:
        dataset: Dataset container exposing an openable ``test`` split.
        metadata_path: Path to the saved model metadata/checkpoint info.
        w2v: Pre-trained word-embedding matrix for graph initialization.
        rescale: Optional ``[lo, hi]`` range; when given, ground-truth
            similarities are rescaled from [0.0, 1.0] into that range.
    """
    print("Configuring Tensorflow Graph")
    with tf.Graph().as_default():
        sess, siamese_model = initialize_tf_graph(metadata_path, w2v)

        dataset.test.open()
        mse, pco, result_set = evaluate(sess=sess,
                                        dataset=dataset.test,
                                        model=siamese_model,
                                        max_dev_itr=0,
                                        mode='test',
                                        step=-1)
        print('Average Pearson Correlation: {}\nAverage MSE: {}'.format(
            pco, mse))
        dataset.test.close()

        # result_set unpacks to (inputs_a, inputs_b, predictions, gold)
        _, _, predictions, gold = result_set
        if rescale is not None:
            gold = datasets.rescale(gold, new_range=rescale,
                                    original_range=[0.0, 1.0])

        figure_path = os.path.join(siamese_model.exp_dir,
                                   'test_no_regression_sim.jpg')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')
        plt.scatter(predictions, gold, label="Similarity", s=0.2)
        plt.savefig(figure_path)
        print("saved similarity plot at {}".format(figure_path))
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
               rescale=(0.0, 1.0), pad=0, raw=False, keep_entities=False):
    """Return the next ``batch_size`` sentence pairs with similarity scores.

    Reads tab-separated rows (sentence1, sentence2, similarity) from the
    open datafile; on EOF the file pointer wraps to the start and the
    epoch counter is incremented.

    Args:
        batch_size: Number of pairs to return.
        seq_begin/seq_end: Whether to add sequence start/end markers.
        rescale: Target range for the similarity scores (validated).
        pad: If non-zero, pad each sequence to this length.
        raw: If True, keep token lists instead of vocabulary ids.
        keep_entities: If False, entity tokens are stripped from both sides.

    Raises:
        Exception: if the dataset has not been opened.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')
    datasets.validate_rescale(rescale)

    left, right, scores = [], [], []
    while len(left) < batch_size:
        line = self.datafile.readline()
        if line == '':
            # EOF: one full pass over the file completed; start over.
            self._epochs_completed += 1
            self.datafile.seek(0)
            continue
        fields = line.strip().split('\t')
        sent_a, sent_b, score = fields[0], fields[1], float(fields[2])
        # convert to dependency tree
        left.append(sent_a.split(' '))
        right.append(sent_b.split(' '))
        scores.append(score)

    if not keep_entities:
        left = self.remove_entities(left)
        right = self.remove_entities(right)

    if raw:
        left = datasets.append_seq_markers(left[:batch_size],
                                           seq_begin, seq_end)
        right = datasets.append_seq_markers(right[:batch_size],
                                            seq_begin, seq_end)
    else:
        left = datasets.seq2id(left[:batch_size], self.vocab_w2i,
                               seq_begin, seq_end)
        right = datasets.seq2id(right[:batch_size], self.vocab_w2i,
                                seq_begin, seq_end)

    if pad != 0:
        left = datasets.padseq(left, pad, raw)
        right = datasets.padseq(right, pad, raw)

    return self.Batch(
        s1=left,
        s2=right,
        sim=datasets.rescale(scores[:batch_size], rescale, (0.0, 1.0)))
def next_batch(self, batch_size=64, format='one_hot', rescale=None, pad=0,
               raw=False, tokenizer='spacy'):
    """Return the next batch from the in-memory ``self.data`` list.

    When fewer than ``batch_size`` samples remain in the current epoch,
    the remaining tail is taken, the data is reshuffled, and the batch is
    topped up from the start of the new ordering.

    Args:
        batch_size: Number of samples per batch.
        format: ``'one_hot'`` converts labels via ``to_categorical``.
        rescale: Optional target range for labels (original is (0.0, 2.0)).
        pad: If non-zero, sequences are padded/truncated to this length.
        raw: If True, return tokenized text without id conversion/padding.
        tokenizer: Tokenizer name forwarded to ``generate_sequences``.
    """
    start = self._index_in_epoch
    if start + batch_size > len(self.data):
        # Epoch boundary: take the unshuffled tail, reshuffle, wrap around.
        chunk = self.data[start:]
        random.shuffle(self.data)
        carry = batch_size - (len(self.data) - start)
        self._epochs_completed += 1
        chunk.extend(self.data[:carry])
        self._index_in_epoch = carry
    else:
        chunk = self.data[start:start + batch_size]
        self._index_in_epoch = start + batch_size

    x, y = zip(*chunk)

    # Generate sequences
    x = self.generate_sequences(x, tokenizer)
    lengths = [min(pad, len(s)) if pad != 0 else len(s) for s in x]

    if raw:
        return self.Batch(x=x, y=y, lengths=lengths)

    if format == 'one_hot':
        y = to_categorical(y, nb_classes=3)
    if rescale is not None:
        datasets.validate_rescale(rescale)
        y = datasets.rescale(y, rescale, (0.0, 2.0))

    ids = datasets.seq2id(x, self.vocab_w2i)
    return self.Batch(x=datasets.padseq(ids, pad), y=y, lengths=lengths)
def test(self, X, Y):
    """Evaluate model performance.

    Args:
        X (np.array): Testing features of shape (samples, features).
        Y (np.array): Testing labels.

    Returns:
        float: Mean of the signed differences between the linear scores
        ``theta . x`` and the labels.
    """
    X = X.astype(np.float64)
    # Apply the same min-max scaling fitted during training.
    if self.normalize == 'rescaling':
        X = ds.rescale(X, self.min, self.max, 0, 1)
    # Append a constant-one bias column to every sample.
    bias = np.ones((X.shape[0], 1))
    X = np.concatenate((X, bias), axis=1)
    scores = np.dot(self.theta, X.T)
    return np.mean(scores - Y)
def train(self, X, Y):
    """Train the regularized least-squares model given benign samples.

    Args:
        X (np.array): Training matrix of shape (samples, features).
        Y (np.array): Training targets of shape (samples,).

    Returns:
        np.array: Learned weight vector of shape (features + 1,)
        (last entry is the bias term); also stored in ``self.theta``.
    """
    X = X.astype(np.float64)
    # normalize X; the fitted min/max are kept for reuse at test time
    if self.normalize == 'rescaling':
        self.min = X.min(axis=0)
        self.max = X.max(axis=0)
        X = ds.rescale(X, self.min, self.max, 0, 1)
    # add bias vector to all samples
    X = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)
    # Solve (X^T X + reg*I) theta = X^T Y directly instead of forming an
    # explicit inverse: numerically more stable and cheaper than
    # np.linalg.inv(...).dot(...).
    xTx = X.T.dot(X)
    regularizer = self.reg_param * np.eye(self.num_features + 1)
    self.theta = np.linalg.solve(xTx + regularizer, X.T.dot(Y))
    return self.theta
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
               rescale=None, pad=0, raw=False, mark_entities=False,
               tokenizer='spacy', sentence_pad=0, one_hot=False):
    """Return the next batch of reviews from the open datafile.

    Each JSON line provides ``review_text``, ``review_header`` and
    ``review_rating``. On EOF the reader advances round-robin to the next
    file in ``self.path_list`` and increments the epoch counter.

    Args:
        batch_size: Number of reviews per batch.
        seq_begin/seq_end: Whether to add sequence start/end markers.
        rescale: Optional target range for ratings (original is [1.0, 5.0]).
        pad: If non-zero, pad text/title/sentence sequences to this length.
        raw: If True, keep tokens instead of vocabulary ids.
        mark_entities: Mark named entities (German models, lang='de').
        tokenizer: Tokenizer name used for both review text and titles.
        sentence_pad: If non-zero, pad the per-review sentence lists.
        one_hot: One-hot encode ratings into 5 classes (mutually exclusive
            with ``rescale``).

    Raises:
        Exception: if the dataset has not been opened.
        ValueError: if both ``rescale`` and ``one_hot`` are set.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')

    text, sentences, ratings, titles, lengths = [], [], [], [], []
    while len(text) < batch_size:
        row = self.datafile.readline()
        if row == '':
            # End of current file: move round-robin to the next shard.
            self._epochs_completed += 1
            self.close()
            self.datafile = open(self.path_list[self.epochs_completed %
                                                len(self.path_list)])
            continue
        json_obj = json.loads(row.strip())
        text.append(datasets.tokenize(json_obj["review_text"], tokenizer))
        lengths.append(len(text[-1]))
        sentences.append(datasets.sentence_tokenizer(json_obj["review_text"]))
        ratings.append(int(json_obj["review_rating"]))
        # FIX: tokenize titles with the same tokenizer as the review body
        # (previously the tokenizer argument was omitted here).
        titles.append(datasets.tokenize(json_obj["review_header"], tokenizer))

    if rescale is not None and not one_hot:
        ratings = datasets.rescale(ratings, rescale, [1.0, 5.0])
    elif rescale is None and one_hot:
        # Ratings are 1..5 — shift to 0..4 before one-hot encoding.
        ratings = to_categorical([x - 1 for x in ratings], nb_classes=5)
    elif rescale is None and not one_hot:
        pass
    else:
        raise ValueError('rescale and one_hot cannot be set together')

    if mark_entities:
        text = datasets.mark_entities(text, lang='de')
        titles = datasets.mark_entities(titles, lang='de')
        sentences = [datasets.mark_entities(sentence, lang='de')
                     for sentence in sentences]

    if not raw:
        text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                               seq_begin, seq_end)
        titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                 seq_begin, seq_end)
        sentences = [datasets.seq2id(sentence, self.vocab_w2i,
                                     seq_begin, seq_end)
                     for sentence in sentences[:batch_size]]
    else:
        text = datasets.append_seq_markers(text[:batch_size],
                                           seq_begin, seq_end)
        titles = datasets.append_seq_markers(titles[:batch_size],
                                             seq_begin, seq_end)
        sentences = [datasets.append_seq_markers(sentence, seq_begin, seq_end)
                     for sentence in sentences[:batch_size]]

    if pad != 0:
        text = datasets.padseq(text[:batch_size], pad, raw)
        titles = datasets.padseq(titles[:batch_size], pad, raw)
        sentences = [datasets.padseq(sentence, pad, raw)
                     for sentence in sentences[:batch_size]]
    if sentence_pad != 0:
        sentences = [datasets.pad_sentences(sentence, sentence_pad, raw)
                     for sentence in sentences[:batch_size]]

    return self.Batch(text=text, sentences=sentences, ratings=ratings,
                      titles=titles, lengths=lengths)
def results(dataset, metadata_path, w2v, rescale=None):
    """Evaluate on train and test splits, fit a non-parametric regression
    from predicted to ground-truth similarities on the train split, and
    save scatter/regression plots for the test split.

    Args:
        dataset: Dataset container exposing ``train`` and ``test`` splits.
        metadata_path: Path to the saved model metadata/checkpoint info.
        w2v: Pre-trained word-embedding matrix for graph initialization.
        rescale: Optional ``[lo, hi]`` range; when given, ground-truth
            similarities are rescaled from [0.0, 1.0] into that range.
    """
    print("Configuring Tensorflow Graph")
    with tf.Graph().as_default():
        sess, siamese_model = initialize_tf_graph(metadata_path, w2v)

        dataset.test.open()
        dataset.train.open()
        test_mse, test_pco, test_result_set = evaluate(
            sess=sess, dataset=dataset.test, model=siamese_model,
            step=-1, max_dev_itr=0, mode='test')
        train_mse, train_pco, train_result_set = evaluate(
            sess=sess, dataset=dataset.train, model=siamese_model,
            max_dev_itr=0, step=-1, mode='train')
        dataset.test.close()
        dataset.train.close()
        print('TEST RESULTS:\nMSE: {}\t Pearson Correlation: {}\n\n'
              'TRAIN RESULTS:\nMSE: {}\t Pearson Correlation: {}'.format(
                  test_mse, test_pco, train_mse, train_pco))

        _, _, train_sims, train_gt = train_result_set
        _, _, test_sims, test_gt = test_result_set
        grid = np.r_[0:1:1000j]
        if rescale is not None:
            train_gt = datasets.rescale(train_gt, new_range=rescale,
                                        original_range=[0.0, 1.0])
            test_gt = datasets.rescale(test_gt, new_range=rescale,
                                       original_range=[0.0, 1.0])
            # grid = np.r_[rescale[0]:rescale[1]:1000j]

        figure_path = os.path.join(siamese_model.exp_dir,
                                   'results_test_sim.jpg')
        reg_fig_path = os.path.join(siamese_model.exp_dir,
                                    'results_line_fit.jpg')

        plt.title('Regression Plot for Test Set Similarities')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')

        print("Performing Non Parametric Regression")
        # Fit on the train split, then map the test predictions through it.
        non_param_reg = non_parametric_regression(
            train_sims, train_gt, method=npr_methods.SpatialAverage())
        reg_test_sim = non_param_reg(test_sims)
        reg_pco = pearsonr(reg_test_sim, test_gt)
        reg_mse = mean_squared_error(test_gt, reg_test_sim)
        print("Post Regression Test Results:\nPCO: {}\nMSE: {}".format(
            reg_pco, reg_mse))

        # First figure: regressed predictions vs. ground truth.
        plt.scatter(reg_test_sim, test_gt, label='Similarities', s=0.2)
        plt.savefig(figure_path)
        plt.clf()

        # Second figure: raw predictions with the smoothing curve overlaid.
        plt.title('Regression Plot for Test Set Similarities')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')
        plt.scatter(test_sims, test_gt, label='Similarities', s=0.2)
        plt.plot(grid, non_param_reg(grid), label="Local Linear Smoothing",
                 linewidth=2.0, color='r')
        plt.savefig(reg_fig_path)
        print("saved similarity plot at {}".format(figure_path))
        print("saved regression plot at {}".format(reg_fig_path))
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
               rescale=None, pad=0, raw=False, mark_entities=False,
               tokenizer='spacy', sentence_pad=0, one_hot=False):
    """Return the next batch of hotel reviews with per-aspect ratings.

    Each JSON line provides ``text``, ``title``, ``num_helpful_votes`` and
    a ``ratings`` dict; missing aspect ratings fall back to the 'overall'
    rating. On EOF the reader advances round-robin to the next file in
    ``self.path_list`` and increments the epoch counter.

    Args:
        batch_size: Number of reviews per batch.
        seq_begin/seq_end: Whether to add sequence start/end markers.
        rescale: Optional target range for ratings (original is [1.0, 5.0]).
        pad: If non-zero, pad text/title/sentence sequences to this length.
        raw: If True, keep tokens instead of vocabulary ids.
        mark_entities: Mark named entities in text, titles and sentences.
        tokenizer: Tokenizer name used for the review text and titles.
        sentence_pad: If non-zero, pad the per-review sentence lists to
            this many sentences.
        one_hot: One-hot encode every rating into 5 classes (mutually
            exclusive with ``rescale``).

    Raises:
        Exception: if the dataset has not been opened.
        ValueError: if both ``rescale`` and ``one_hot`` are set.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')

    # One list per rated aspect; 'overall' doubles as the fallback value.
    aspects = ['service', 'cleanliness', 'overall', 'value',
               'sleep_quality', 'rooms']
    ratings = {aspect: [] for aspect in aspects}
    text, sentences, titles, helpful_votes, lengths = [], [], [], [], []

    def aspect_rating(all_ratings, aspect):
        # Fall back to the overall rating when the aspect is missing.
        if aspect in all_ratings:
            return int(all_ratings[aspect])
        return int(all_ratings['overall'])

    while len(text) < batch_size:
        row = self.datafile.readline()
        if row == '':
            # End of current file: move round-robin to the next shard.
            self._epochs_completed += 1
            self.close()
            self.datafile = open(self.path_list[self.epochs_completed %
                                                len(self.path_list)])
            continue
        json_obj = json.loads(row.strip())
        text.append(datasets.tokenize(json_obj["text"], tokenizer))
        lengths.append(len(text[-1]))
        sentences.append(datasets.sentence_tokenizer(json_obj["text"]))
        for aspect in aspects:
            ratings[aspect].append(aspect_rating(json_obj["ratings"], aspect))
        helpful_votes.append(json_obj["num_helpful_votes"])
        # FIX: tokenize titles with the same tokenizer as the review body
        # (previously the tokenizer argument was omitted here).
        titles.append(datasets.tokenize(json_obj["title"], tokenizer))

    if rescale is not None and not one_hot:
        for aspect in aspects:
            ratings[aspect] = datasets.rescale(ratings[aspect], rescale,
                                               [1.0, 5.0])
    elif rescale is None and one_hot:
        # Ratings are 1..5 — shift to 0..4 before one-hot encoding.
        for aspect in aspects:
            ratings[aspect] = to_categorical(
                [x - 1 for x in ratings[aspect]], nb_classes=5)
    elif rescale is None and not one_hot:
        pass
    else:
        raise ValueError('rescale and one_hot cannot be set together')

    if mark_entities:
        text = datasets.mark_entities(text)
        titles = datasets.mark_entities(titles)
        sentences = [datasets.mark_entities(sentence)
                     for sentence in sentences]

    if not raw:
        text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                               seq_begin, seq_end)
        titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                 seq_begin, seq_end)
        sentences = [datasets.seq2id(sentence, self.vocab_w2i,
                                     seq_begin, seq_end)
                     for sentence in sentences[:batch_size]]
    else:
        text = datasets.append_seq_markers(text[:batch_size],
                                           seq_begin, seq_end)
        titles = datasets.append_seq_markers(titles[:batch_size],
                                             seq_begin, seq_end)
        sentences = [datasets.append_seq_markers(sentence, seq_begin, seq_end)
                     for sentence in sentences[:batch_size]]

    if pad != 0:
        text = datasets.padseq(text[:batch_size], pad, raw)
        titles = datasets.padseq(titles[:batch_size], pad, raw)
        sentences = [datasets.padseq(sentence, pad, raw)
                     for sentence in sentences[:batch_size]]
    if sentence_pad != 0:
        # BUG FIX: previously this padded with `pad` instead of
        # `sentence_pad` (compare the sibling review next_batch).
        sentences = [datasets.pad_sentences(sentence, sentence_pad, raw)
                     for sentence in sentences[:batch_size]]

    return self.Batch(text=text,
                      sentences=sentences,
                      ratings_service=ratings['service'],
                      ratings_cleanliness=ratings['cleanliness'],
                      ratings=ratings['overall'],
                      ratings_value=ratings['value'],
                      ratings_sleep_quality=ratings['sleep_quality'],
                      ratings_rooms=ratings['rooms'],
                      titles=titles,
                      helpful_votes=helpful_votes,
                      lengths=lengths)