Example #1
def test(dataset, metadata_path, w2v, rescale=None):
    print("Configuring Tensorflow Graph")
    with tf.Graph().as_default():
        sess, siamese_model = initialize_tf_graph(metadata_path, w2v)
        dataset.test.open()
        avg_test_loss, avg_test_pco, test_result_set = evaluate(
            sess=sess,
            dataset=dataset.test,
            model=siamese_model,
            max_dev_itr=0,
            mode='test',
            step=-1)
        print('Average Pearson Correlation: {}\nAverage MSE: {}'.format(
            avg_test_pco, avg_test_loss))
        dataset.test.close()
        _, _, sims, gt = test_result_set
        if rescale is not None:
            gt = datasets.rescale(gt,
                                  new_range=rescale,
                                  original_range=[0.0, 1.0])

        figure_path = os.path.join(siamese_model.exp_dir,
                                   'test_no_regression_sim.jpg')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')
        plt.scatter(sims, gt, label="Similarity", s=0.2)
        plt.savefig(figure_path)
        print("saved similarity plot at {}".format(figure_path))
Example #2
    def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
                   rescale=(0.0, 1.0), pad=0, raw=False, keep_entities=False):
        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        datasets.validate_rescale(rescale)

        s1s, s2s, sims = [], [], []

        while len(s1s) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.datafile.seek(0)
                continue
            cols = row.strip().split('\t')
            s1, s2, sim = cols[0], cols[1], float(cols[2])
            s1, s2 = s1.split(' '), s2.split(' ')

            # TODO: convert to dependency tree

            s1s.append(s1)
            s2s.append(s2)
            sims.append(sim)

        if not keep_entities:
            s1s = self.remove_entities(s1s)
            s2s = self.remove_entities(s2s)

        if not raw:
            s1s = datasets.seq2id(s1s[:batch_size], self.vocab_w2i, seq_begin,
                                  seq_end)
            s2s = datasets.seq2id(s2s[:batch_size], self.vocab_w2i, seq_begin,
                                  seq_end)
        else:
            s1s = datasets.append_seq_markers(s1s[:batch_size], seq_begin, seq_end)
            s2s = datasets.append_seq_markers(s2s[:batch_size], seq_begin, seq_end)
        if pad != 0:
            s1s = datasets.padseq(s1s, pad, raw)
            s2s = datasets.padseq(s2s, pad, raw)
        batch = self.Batch(
            s1=s1s,
            s2=s2s,
            sim=datasets.rescale(sims[:batch_size], rescale, (0.0, 1.0)))
        return batch
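A minimal usage sketch for this method, assuming the containing dataset object exposes the `open()`/`close()` pair that the exception message refers to (batch size, pad length, and the loop bound are placeholders):

dataset.open()
for _ in range(100):  # placeholder iteration count
    batch = dataset.next_batch(batch_size=32, pad=30)
    # batch.s1 / batch.s2 hold padded word-id sequences;
    # batch.sim holds similarity scores rescaled to the requested range
dataset.close()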
Example #3
    def next_batch(self,
                   batch_size=64,
                   format='one_hot',
                   rescale=None,
                   pad=0,
                   raw=False,
                   tokenizer='spacy'):

        samples = None
        if self._index_in_epoch + batch_size > len(self.data):
            samples = self.data[self._index_in_epoch:len(self.data)]
            random.shuffle(self.data)
            missing_samples = batch_size - (len(self.data) -
                                            self._index_in_epoch)
            self._epochs_completed += 1
            samples.extend(self.data[0:missing_samples])
            self._index_in_epoch = missing_samples
        else:
            samples = self.data[self._index_in_epoch:self._index_in_epoch +
                                batch_size]
            self._index_in_epoch += batch_size

        x, y = zip(*samples)
        # Generate sequences
        x = self.generate_sequences(x, tokenizer)
        lens = [len(s) if pad == 0 else min(pad, len(s)) for s in x]

        if raw:
            return self.Batch(x=x, y=y, lengths=lens)

        if format == 'one_hot':
            y = to_categorical(y, nb_classes=3)

        if rescale is not None:
            datasets.validate_rescale(rescale)
            y = datasets.rescale(y, rescale, (0.0, 2.0))

        batch = self.Batch(x=datasets.padseq(
            datasets.seq2id(x, self.vocab_w2i), pad),
                           y=y,
                           lengths=lens)

        return batch
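Because this variant wraps around and counts epochs internally, one full epoch can be consumed by watching `_epochs_completed` (a sketch; `ds` stands for an instance of the containing class):

start = ds._epochs_completed
while ds._epochs_completed == start:
    batch = ds.next_batch(batch_size=64, format='one_hot', pad=50)
    # batch.x: padded id sequences; batch.y: one-hot labels;
    # batch.lengths: original (pre-padding) sequence lengths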
Example #4
    def test(self, X, Y):
        """Evaluate model performance.

        Args:
            X (np.array): Testing features.
            Y (np.array): Testing labels.
        """

        X = X.astype(np.float64)

        # normalize X
        if self.normalize == 'rescaling':
            X = ds.rescale(X, self.min, self.max, 0, 1)

        # add bias vector to all samples
        X = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)

        # predict with the learned weights (the appended ones column is the bias)
        scores = np.dot(self.theta, X.T)

        # mean signed residual of the predictions
        return np.mean(scores - Y)
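Note that `np.mean(scores - Y)` is the mean signed residual, which measures bias rather than overall error. If mean squared error were wanted instead, a drop-in replacement for the final line would be:

        return np.mean((scores - Y) ** 2)  # mean squared error variant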
Example #5
    def train(self, X, Y):
        """Train LR model given benign samples.

        Args:
            X (np.array): Training matrix of shape (samples, features)
        """

        X = X.astype(np.float64)

        # normalize X
        if self.normalize == 'rescaling':
            self.min = X.min(axis=0)
            self.max = X.max(axis=0)
            X = ds.rescale(X, self.min, self.max, 0, 1)

        # add bias vector to all samples
        X = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)

        # closed-form ridge regression: theta = (X^T X + reg * I)^{-1} X^T Y
        xTx = X.T.dot(X)
        inv = np.linalg.inv(xTx + self.reg_param * np.eye(self.num_features+1))
        self.theta = inv.dot(X.T).dot(Y)

        return self.theta
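The update above is the closed-form ridge (regularized least squares) solution, theta = (X^T X + lambda*I)^(-1) X^T Y. A self-contained sketch of the same computation on synthetic data (all names here are illustrative, not part of the class):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                 # 100 samples, 3 features
true_theta = np.array([2.0, -1.0, 0.5, 0.3])  # last entry acts as the bias
Xb = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)  # bias column
Y = Xb.dot(true_theta) + rng.normal(scale=0.01, size=100)

reg_param = 1e-3
theta = np.linalg.inv(Xb.T.dot(Xb) + reg_param * np.eye(Xb.shape[1])).dot(Xb.T).dot(Y)
print(theta)  # approximately recovers true_theta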
Example #6
    def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
                   rescale=None, pad=0, raw=False, mark_entities=False,
                   tokenizer='spacy', sentence_pad=0, one_hot=False):
        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        text, sentences, ratings, titles, lengths = [], [], [], [], []

        while len(text) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.close()
                self.datafile = open(self.path_list[self.epochs_completed % len(self.path_list)])
                continue
            json_obj = json.loads(row.strip())
            text.append(datasets.tokenize(json_obj["review_text"], tokenizer))
            lengths.append(len(text[-1]))
            sentences.append(datasets.sentence_tokenizer(json_obj["review_text"]))
            ratings.append(int(json_obj["review_rating"]))
            titles.append(datasets.tokenize(json_obj["review_header"]))


        if rescale is not None and not one_hot:
            ratings = datasets.rescale(ratings, rescale, [1.0, 5.0])
        elif rescale is None and one_hot:
            ratings = [x - 1 for x in ratings]
            ratings = to_categorical(ratings, nb_classes=5)
        elif rescale is None and not one_hot:
            pass
        else:
            raise ValueError('rescale and one_hot cannot be set together')

        if mark_entities:
            text = datasets.mark_entities(text, lang='de')
            titles = datasets.mark_entities(titles, lang='de')
            sentences = [datasets.mark_entities(sentence, lang='de')
                         for sentence in sentences]

        if not raw:
            text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                                   seq_begin, seq_end)
            titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                     seq_begin, seq_end)
            sentences = [datasets.seq2id(sentence, self.vocab_w2i, seq_begin,
                                         seq_end)
                         for sentence in sentences[:batch_size]]
        else:
            text = datasets.append_seq_markers(text[:batch_size],
                                               seq_begin, seq_end)
            titles = datasets.append_seq_markers(titles[:batch_size],
                                                 seq_begin, seq_end)
            sentences = [datasets.append_seq_markers(sentence, seq_begin,
                         seq_end) for sentence in sentences[:batch_size]]

        if pad != 0:
            text = datasets.padseq(text[:batch_size], pad, raw)
            titles = datasets.padseq(titles[:batch_size], pad, raw)
            sentences = [datasets.padseq(sentence, pad, raw) for sentence in
                         sentences[:batch_size]]
        if sentence_pad != 0:
            sentences = [datasets.pad_sentences(sentence, sentence_pad, raw) for
                         sentence in sentences[:batch_size]]

        batch = self.Batch(text=text, sentences=sentences,
                           ratings=ratings, titles=titles, lengths=lengths)
        return batch
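A hedged usage sketch for this review-dataset variant (same open/close protocol; `one_hot=True` with `rescale=None` yields 5-class one-hot ratings, matching the branch above):

dataset.open()
batch = dataset.next_batch(batch_size=32, one_hot=True, pad=100,
                           sentence_pad=20, mark_entities=True)
# batch.ratings: one-hot 5-class labels; batch.text / batch.titles: id
# sequences; batch.sentences: per-review lists of padded sentences
dataset.close()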
Example #7
def results(dataset, metadata_path, w2v, rescale=None):
    print("Configuring Tensorflow Graph")
    with tf.Graph().as_default():
        sess, siamese_model = initialize_tf_graph(metadata_path, w2v)
        dataset.test.open()
        dataset.train.open()
        avg_test_loss, avg_test_pco, test_result_set = evaluate(
            sess=sess,
            dataset=dataset.test,
            model=siamese_model,
            step=-1,
            max_dev_itr=0,
            mode='test')
        avg_train_loss, avg_train_pco, train_result_set = evaluate(
            sess=sess,
            dataset=dataset.train,
            model=siamese_model,
            max_dev_itr=0,
            step=-1,
            mode='train')
        dataset.test.close()
        dataset.train.close()
        print('TEST RESULTS:\nMSE: {}\t Pearson Correlation: {}\n\n'
              'TRAIN RESULTS:\nMSE: {}\t Pearson Correlation: {}'.format(
                  avg_test_loss, avg_test_pco, avg_train_loss, avg_train_pco))

        _, _, train_sims, train_gt = train_result_set
        _, _, test_sims, test_gt = test_result_set
        # the complex step in np.r_ gives a point count: 1000 points in [0, 1]
        grid = np.r_[0:1:1000j]

        if rescale is not None:
            train_gt = datasets.rescale(train_gt,
                                        new_range=rescale,
                                        original_range=[0.0, 1.0])
            test_gt = datasets.rescale(test_gt,
                                       new_range=rescale,
                                       original_range=[0.0, 1.0])
            # grid = np.r_[rescale[0]:rescale[1]:1000j]

        figure_path = os.path.join(siamese_model.exp_dir,
                                   'results_test_sim.jpg')
        reg_fig_path = os.path.join(siamese_model.exp_dir,
                                    'results_line_fit.jpg')
        plt.title('Regression Plot for Test Set Similarities')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')

        print("Performing Non Parametric Regression")
        non_param_reg = non_parametric_regression(
            train_sims, train_gt, method=npr_methods.SpatialAverage())

        reg_test_sim = non_param_reg(test_sims)
        reg_pco = pearsonr(reg_test_sim, test_gt)
        reg_mse = mean_squared_error(test_gt, reg_test_sim)
        print("Post Regression Test Results:\nPCO: {}\nMSE: {}".format(
            reg_pco, reg_mse))

        plt.scatter(reg_test_sim, test_gt, label='Similarities', s=0.2)
        plt.savefig(figure_path)

        plt.clf()

        plt.title('Regression Plot for Test Set Similarities')
        plt.ylabel('Ground Truth Similarities')
        plt.xlabel('Predicted Similarities')
        plt.scatter(test_sims, test_gt, label='Similarities', s=0.2)
        plt.plot(grid,
                 non_param_reg(grid),
                 label="Local Linear Smoothing",
                 linewidth=2.0,
                 color='r')
        plt.savefig(reg_fig_path)

        print("saved similarity plot at {}".format(figure_path))
        print("saved regression plot at {}".format(reg_fig_path))
Example #8
    def next_batch(self,
                   batch_size=64,
                   seq_begin=False,
                   seq_end=False,
                   rescale=None,
                   pad=0,
                   raw=False,
                   mark_entities=False,
                   tokenizer='spacy',
                   sentence_pad=0,
                   one_hot=False):
        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        text, sentences, ratings_service, ratings_cleanliness, \
        ratings_overall, ratings_value, ratings_sleep_quality, ratings_rooms, \
        titles, helpful_votes, lengths = [], [], [], [], [], [], [], [], [], [], []

        while len(text) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.close()
                self.datafile = open(self.path_list[self.epochs_completed %
                                                    len(self.path_list)])
                continue
            json_obj = json.loads(row.strip())
            text.append(datasets.tokenize(json_obj["text"], tokenizer))
            lengths.append(len(text[-1]))
            sentences.append(datasets.sentence_tokenizer(json_obj["text"]))
            # fall back to the overall rating when a sub-rating is missing
            ratings_service.append(
                int(json_obj["ratings"]["service"]) if 'service' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_cleanliness.append(
                int(json_obj["ratings"]["cleanliness"]) if 'cleanliness' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_overall.append(int(json_obj["ratings"]["overall"]))
            ratings_value.append(
                int(json_obj["ratings"]["value"]) if 'value' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_sleep_quality.append(
                int(json_obj["ratings"]["sleep_quality"]) if 'sleep_quality' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_rooms.append(
                int(json_obj["ratings"]["rooms"]) if 'rooms' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            helpful_votes.append(json_obj["num_helpful_votes"])
            titles.append(datasets.tokenize(json_obj["title"]))

        if rescale is not None and not one_hot:
            ratings_service = datasets.rescale(ratings_service, rescale,
                                               [1.0, 5.0])
            ratings_cleanliness = datasets.rescale(ratings_cleanliness,
                                                   rescale, [1.0, 5.0])
            ratings_overall = datasets.rescale(ratings_overall, rescale,
                                               [1.0, 5.0])
            ratings_value = datasets.rescale(ratings_value, rescale,
                                             [1.0, 5.0])
            ratings_sleep_quality = datasets.rescale(ratings_sleep_quality,
                                                     rescale, [1.0, 5.0])
            ratings_rooms = datasets.rescale(ratings_rooms, rescale,
                                             [1.0, 5.0])
        elif rescale is None and one_hot:
            ratings_service = to_categorical([x - 1 for x in ratings_service],
                                             nb_classes=5)
            ratings_cleanliness = to_categorical(
                [x - 1 for x in ratings_cleanliness], nb_classes=5)
            ratings_overall = to_categorical([x - 1 for x in ratings_overall],
                                             nb_classes=5)
            ratings_value = to_categorical([x - 1 for x in ratings_value],
                                           nb_classes=5)
            ratings_sleep_quality = to_categorical(
                [x - 1 for x in ratings_sleep_quality], nb_classes=5)
            ratings_rooms = to_categorical([x - 1 for x in ratings_rooms],
                                           nb_classes=5)
        elif rescale is None and not one_hot:
            pass
        else:
            raise ValueError('rescale and one_hot cannot be set together')

        if mark_entities:
            text = datasets.mark_entities(text)
            titles = datasets.mark_entities(titles)
            sentences = [
                datasets.mark_entities(sentence) for sentence in sentences
            ]

        if not raw:
            text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                                   seq_begin, seq_end)
            titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                     seq_begin, seq_end)
            sentences = [
                datasets.seq2id(sentence, self.vocab_w2i, seq_begin, seq_end)
                for sentence in sentences[:batch_size]
            ]
        else:
            text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                               seq_end)
            titles = datasets.append_seq_markers(titles[:batch_size],
                                                 seq_begin, seq_end)
            sentences = [
                datasets.append_seq_markers(sentence, seq_begin, seq_end)
                for sentence in sentences[:batch_size]
            ]

        if pad != 0:
            text = datasets.padseq(text[:batch_size], pad, raw)
            titles = datasets.padseq(titles[:batch_size], pad, raw)
            sentences = [
                datasets.padseq(sentence, pad, raw)
                for sentence in sentences[:batch_size]
            ]
        if sentence_pad != 0:
            sentences = [
                datasets.pad_sentences(sentence, sentence_pad, raw)
                for sentence in sentences[:batch_size]
            ]

        batch = self.Batch(text=text,
                           sentences=sentences,
                           ratings_service=ratings_service,
                           ratings_cleanliness=ratings_cleanliness,
                           ratings=ratings_overall,
                           ratings_value=ratings_value,
                           ratings_sleep_quality=ratings_sleep_quality,
                           ratings_rooms=ratings_rooms,
                           titles=titles,
                           helpful_votes=helpful_votes,
                           lengths=lengths)
        return batch
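Finally, a usage sketch for this hotel-review variant (hedged; the field names follow the `Batch` constructed above):

dataset.open()
batch = dataset.next_batch(batch_size=16, rescale=(0.0, 1.0), pad=200)
# every ratings_* field is rescaled from the 1-5 star range to [0.0, 1.0];
# batch.helpful_votes and batch.lengths pass through unchanged
dataset.close()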