def get_aggregated_vectors(google=True, data=SST_KAGGLE, average=True, dim=300):
    """Load a data set and turn each document into one aggregated vector.

    Args:
        google: if true, the embedding model comes from ``read_google_model()``;
            otherwise from ``read_glove_model(dim=dim)``.
        data: data set identifier. Only ``SST_KAGGLE`` is supported.
        average: forwarded unchanged to ``get_reviews_vectors`` (presumably
            selects averaging vs. another aggregation -- confirm against
            that helper).
        dim: embedding dimension passed to ``read_glove_model``; not used
            when ``google`` is true.

    Returns:
        ``(train_x, train_y, test_x)`` where ``train_x`` and ``test_x`` are the
        results of ``get_reviews_vectors(..., aggregate=True)`` and ``train_y``
        is returned exactly as read from the pickle.

    Raises:
        NotImplementedError: if ``data`` is not ``SST_KAGGLE``.
    """
    # Reject unsupported data sets before the (potentially slow) model load,
    # instead of failing later on unbound local variables.
    if data != SST_KAGGLE:
        raise NotImplementedError('No such aggregated data set %s' % (data,))

    model = read_google_model() if google else read_glove_model(dim=dim)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("getting aggregate word vectors for documents...")

    train_x, train_y, test_x = read_sst_kaggle_pickle()
    train_x = get_reviews_vectors(train_x, model, average=average, aggregate=True)
    test_x = get_reviews_vectors(test_x, model, average=average, aggregate=True)
    return train_x, train_y, test_x
def _select_cv_reader(data, cutoff):
    """Return ``(reader, cutoff)`` for a cross-validation data set."""
    if data == ROTTEN_TOMATOES:
        return read_rotten_pickle, 56
    if data == SUBJ:
        # SUBJ is the only cross-validation set that keeps the caller's cutoff.
        return read_subj_pickle, cutoff
    if data == CUSTOMER_REVIEW:
        return read_cr_pickle, 45
    if data == MPQA:
        return read_mpqa_pickle, 20
    raise NotImplementedError('Not such cross validation data set %s' % (data,))


def _select_split_reader(data, cutoff):
    """Return ``(reader, cutoff)`` for a train/validate/test data set."""
    if data == IMDB:
        return read_imdb_pickle, 75
    if data == SST_SENT:
        return read_sst_sent_pickle, 50
    if data == SST_SENT_POL:
        return (lambda: read_sst_sent_pickle(polarity=True)), 50
    if data == TREC:
        return read_trec_pickle, 30
    raise NotImplementedError('Not such train/dev/test data set %s' % (data,))


def get_document_matrices(google=False, dim=100, cutoff=50, uniform=True, data='rotten', cv=True, huge=False):
    """Load a data set and convert its documents to non-aggregated vectors.

    Args:
        google: if true, the embedding model comes from ``read_google_model()``;
            otherwise from ``read_glove_model(dim=dim, huge=huge)``.
        dim: forwarded to ``read_glove_model``; not used when ``google`` is true.
        cutoff: forwarded to ``get_reviews_vectors``. NOTE: every supported
            data set except ``SUBJ`` overrides this with its own fixed value
            (ROTTEN_TOMATOES 56, CUSTOMER_REVIEW 45, MPQA 20, IMDB 75,
            SST_SENT 50, SST_SENT_POL 50, TREC 30).
        uniform: forwarded unchanged to ``get_reviews_vectors``.
        data: data set identifier. With ``cv=True`` one of ROTTEN_TOMATOES,
            SUBJ, CUSTOMER_REVIEW, MPQA; with ``cv=False`` one of IMDB,
            SST_SENT, SST_SENT_POL, TREC.
            NOTE(review): the default is the literal 'rotten'; presumably it
            equals ROTTEN_TOMATOES -- confirm.
        cv: selects the cross-validation data sets (returns ``x, y``) versus
            the pre-split ones (returns six arrays).
        huge: forwarded to ``read_glove_model``; not used when ``google`` is true.

    Returns:
        ``(x, y)`` when ``cv`` is true, otherwise
        ``(train_x, train_y, validate_x, validate_y, test_x, test_y)``.
        Every element is passed through ``np.asarray``.

    Raises:
        NotImplementedError: if ``data`` is not supported for the chosen mode.
    """
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("getting concatenated word vectors for documents...")

    # Resolve (and thereby validate) the data set before loading the model so
    # an unsupported name fails immediately rather than after a slow load.
    if cv:
        reader, cutoff = _select_cv_reader(data, cutoff)
    else:
        reader, cutoff = _select_split_reader(data, cutoff)

    model = read_google_model() if google else read_glove_model(dim=dim, huge=huge)

    def vectorize(reviews):
        # Same call for every split: non-aggregated vectors, then ndarray.
        vectors = get_reviews_vectors(reviews, model, aggregate=False,
                                      cutoff=cutoff, uniform=uniform)
        return np.asarray(vectors)

    if cv:
        x, y = reader()
        return vectorize(x), np.asarray(y)

    train_x, train_y, validate_x, validate_y, test_x, test_y = reader()
    train_x = vectorize(train_x)
    validate_x = vectorize(validate_x)
    test_x = vectorize(test_x)
    return (train_x, np.asarray(train_y),
            validate_x, np.asarray(validate_y),
            test_x, np.asarray(test_y))