Example #1
def main():
    # Load the dataset
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']
    log('loaded dataset!')
    traindocs = [doc.content for doc in train_data if int(doc.rating) != 0]
    trainlabels = [
        int(doc.rating) for doc in train_data if int(doc.rating) != 0
    ]

    # Split the dataset
    if TEST_SIZE > 0:
        log('split dataset...')
        docs_train, docs_val, label_train, label_val = train_test_split(
            traindocs, trainlabels, test_size=TEST_SIZE, random_state=0)
    else:
        docs_train = traindocs
        label_train = trainlabels

    # Train a new Doc2Vec model, or load a prebuilt one
    if not USE_BUILD_MODEL:
        log('make iterator...')
        it = LabeledLineSentence(docs_train, label_train)
        log('start training NN')
        d2v = train_model(it)
    else:
        d2v = gensim.models.Doc2Vec.load('Models/doc2vec_val.model')

    # Predict on the validation split (assumes TEST_SIZE > 0)
    val_predictions = predict_val_set(d2v, docs_val)

    # Print the MAE
    print('MAE on validation set: ' +
          str(mean_absolute_error(label_val, val_predictions)))
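# LabeledLineSentence and train_model are not defined in this snippet; the
# sketch below is one possible minimal implementation, assuming documents are
# whitespace-tokenised and that a recent gensim (>= 3.x, where the parameters
# are called vector_size/epochs) is used.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class LabeledLineSentence(object):
    """Iterate over documents as TaggedDocuments, tagged with their index."""

    def __init__(self, docs, labels):
        self.docs = docs
        self.labels = labels

    def __iter__(self):
        for i, doc in enumerate(self.docs):
            yield TaggedDocument(words=doc.split(), tags=['DOC_%d' % i])


def train_model(corpus, vector_size=100, epochs=20):
    """Train a Doc2Vec model on the tagged corpus."""
    documents = list(corpus)
    model = Doc2Vec(vector_size=vector_size, min_count=2, workers=4)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=epochs)
    return model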
Example #2
def main():
    """
    The main flow of the program:
    - Data is read and preprocessed
    - An ensemble model is trained on the training data
    - The validation set is predicted and different error metrics are printed
    - The test set is predicted and the predictions are written to a file
    """
    log('Preprocessing data...')
    preproc = Preprocessor(a_value=BEST_A,
                           epsilon=BEST_EPSILON,
                           use_cached_features=USE_CACHED_FEATURES)
    X_train, X_val, y_train, y_val, X_test = preproc.load_and_preprocess()

    log('Training ensemble...')
    ensemble = VotingClassifier(estimators=[
        ('multinomial', MultinomialNB(alpha=0.01)),
        ('logistic_sag_balanced',
         LogisticRegression(solver='sag',
                            n_jobs=NUM_THREADS,
                            C=5,
                            tol=0.01,
                            class_weight='balanced')),
        ('logistic_lbfgs_balanced',
         LogisticRegression(solver='lbfgs',
                            n_jobs=NUM_THREADS,
                            C=5,
                            tol=0.01,
                            class_weight='balanced')),
    ],
                                voting='soft',
                                weights=[1, 1, 1])
    ensemble = ensemble.fit(X_train, y_train)

    # Uncomment when using a test_size > 0 in preprocessor.py
    # log('Predicting validation set...')
    # predictions_val = ensemble.predict(X_val)
    # if USE_CACHED_FEATURES:
    #     reviews = preproc.val_reviews
    # else:
    #     reviews = preproc.load_val_reviews()
    # predictions_val = fix_zero_predictions(predictions_val, reviews)
    # log('Validation error = %s' % str(mean_absolute_error(predictions_val, y_val)))
    # log(classification_report(predictions_val, y_val))
    # plot_confusion_matrix(confusion_matrix(y_val, predictions_val), classes=[1, 2, 3, 4, 5],
    #                       title='Normalized confusion matrix: validation set', filename='Plots/val_cnf_matrix.pdf')

    log('Predicting test set...')
    test_reviews = data.load_pickled_data()['test']
    test_content = [x.content for x in test_reviews]
    predictions_test = ensemble.predict(X_test)
    predictions_test = fix_zero_predictions(predictions_test, test_content)

    pred_file_name = utils.generate_unqiue_file_name(PREDICTIONS_BASENAME,
                                                     'csv')
    log('Dumping predictions to %s...' % pred_file_name)
    write_predictions_to_csv(predictions_test, pred_file_name)

    log('That\'s all folks!')
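# For reference: with voting='soft' and equal weights, the VotingClassifier
# above averages the class probabilities of its fitted base estimators and
# predicts the class with the highest mean probability. A minimal sketch of
# that computation on an already-fitted ensemble:
import numpy as np


def soft_vote(fitted_ensemble, X):
    """Reproduce soft voting by averaging predict_proba over the estimators."""
    mean_proba = np.mean(
        [est.predict_proba(X) for est in fitted_ensemble.estimators_], axis=0)
    return fitted_ensemble.classes_[np.argmax(mean_proba, axis=1)]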
def main():
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']

    histogram_ratings(train_data)
    histogram_prices(train_data + test_data)
    list_authors(train_data, test_data)
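# histogram_ratings, histogram_prices and list_authors are not defined in this
# snippet; a minimal sketch of the first one, assuming matplotlib is available,
# that each review carries an integer rating from 1 to 5, and that the output
# path is free to choose:
import matplotlib.pyplot as plt


def histogram_ratings(reviews, filename='Plots/ratings_histogram.pdf'):
    """Plot a histogram of the review ratings and save it to a file."""
    ratings = [int(review.rating) for review in reviews]
    plt.hist(ratings, bins=range(1, 7), align='left', rwidth=0.8)
    plt.xlabel('Rating')
    plt.ylabel('Number of reviews')
    plt.savefig(filename)
    plt.close()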
def main():
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']

    non_ascii(train_data)
    exit()  # NOTE: nothing below this call is executed
    # dataset_info(train_data, test_data)
    # list_authors(train_data, test_data)
    histogram_ratings(train_data)
    histogram_amount_of_reviews_per_hotel(train_data)
    def load(self):
        """
        Load the pickled dataset.

        :return list<Review> dataset['train']: A list of all train (and validation) reviews.
        :return list<Review> dataset['test']: A list of all test reviews.
        """
        log('Loading test and train data...')
        dataset = data.load_pickled_data()
        return dataset['train'], dataset['test']
def learn(parameter=0.5,
          classification_type='lsvc',
          generate_submission=False):
    """
    The actual learning algorithm. It supports an adjustable parameter and a
    choice between several classification methods.

    :param parameter: the adjustable parameter for the classification
    :param classification_type: the classification method to use (default: 'lsvc')
    :param generate_submission: boolean, generate a submission or not
    :return: an estimate of the score, obtained by k-fold cross validation
    """
    data_set = data.load_pickled_data()
    train_data_set = data_set['train']
    pipeline = get_pipeline(parameter, classification_type)

    # k-fold cross validation, with k='KFOLD_SPLITS'
    kfold = KFold(n_splits=KFOLD_SPLITS, shuffle=True)
    i = 1
    total_mae = 0

    # execute 'KFOLD_ITERATIONS' times
    for train_idx, test_idx in kfold.split(train_data_set):
        test_data = operator.itemgetter(*test_idx)(train_data_set)
        train_data = operator.itemgetter(*train_idx)(train_data_set)

        pipeline.fit(train_data, get_target(train_data))
        prediction = pipeline.predict(test_data)

        mae = cost_mae(prediction, get_target(test_data))
        total_mae += mae

        if i == KFOLD_ITERATIONS:
            break

        i += 1

    # calculate the final score guess as the mean of the fold scores
    mean_score_guess = total_mae / KFOLD_ITERATIONS

    # create a csv file to submit on Kaggle
    if generate_submission:
        test_data_set = data_set['test']
        pipeline.fit(train_data_set, get_target(train_data_set))
        predicted_ratings = pipeline.predict(test_data_set)
        dump_predictions(predicted_ratings, mean_score_guess)

    # return the score calculated by cross validation
    return mean_score_guess
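# get_target and cost_mae are not shown in this snippet; a minimal sketch,
# assuming that the target is the integer rating of each review and that the
# cost is the plain mean absolute error:
from sklearn.metrics import mean_absolute_error


def get_target(reviews):
    """Extract the rating of every review as the prediction target."""
    return [int(review.rating) for review in reviews]


def cost_mae(predictions, targets):
    """Mean absolute error between predicted and true ratings."""
    return mean_absolute_error(targets, predictions)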
Example #7
def main():
    if not os.path.exists(DEFAULT_PICKLE_PATH):
        print('Creating pickle file...')
        data.create_pickled_data(overwrite_old=True)
        
    if not USE_CACHED_FEATURES:
        log('Loading test and train data...')
        dataset = data.load_pickled_data()
        
        log('Extracting features and target...')
        X_train, X_val, y_train, y_val, X_test = transform_data(dataset['train'], dataset['test'])
        
        print('train feature shape: %d' % X_train.shape[1])
        print('val feature shape: %d' % X_val.shape[1])
        
        dump_all(X_train, X_val, y_train, y_val, X_test)
    else:
        X_train, X_val, y_train, y_val, X_test = load_all()
    
    # Using TruncatedSVD
    #tsvd = TruncatedSVD(n_components=5000)
    #X_train = tsvd.fit_transform(X_train)
    #X_val = tsvd.fit_transform(X_val)
    #X_test = tsvd.fit_transform(X_test)
    
    log('Training model...')
    model = LogisticRegression(solver='sag', n_jobs=NUM_THREADS, C=5, tol=0.01)
    # Using RFE
    #model = RFE(model, n_features_to_select=80000, step=10000, verbose=1)
    model = model.fit(X_train, y_train)
    
    log('Predicting train and validation set...')
    predictions_train = model.predict(X_train)
    log('Train error = %s' % str(mean_absolute_error(predictions_train, y_train)))
    predictions_val = model.predict(X_val)
    log('Validation error = %s' % str(mean_absolute_error(predictions_val, y_val)))
    
    log('Predicting test set...')
    predictions_test = model.predict(X_test)
    
    pred_file_name = utils.generate_unqiue_file_name(PREDICTIONS_BASENAME, 'csv')
    log('Dumping predictions to %s...' % pred_file_name)
    write_predictions_to_csv(predictions_test, pred_file_name)
    
    log('That\'s all folks!')
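# dump_all and load_all (the feature cache used above) are not shown; a
# minimal sketch using pickle, with an assumed cache location:
import pickle

FEATURE_CACHE_PATH = 'Data/feature_cache.pkl'  # assumed path


def dump_all(X_train, X_val, y_train, y_val, X_test, path=FEATURE_CACHE_PATH):
    """Cache the preprocessed feature matrices and targets on disk."""
    with open(path, 'wb') as f:
        pickle.dump((X_train, X_val, y_train, y_val, X_test), f)


def load_all(path=FEATURE_CACHE_PATH):
    """Load the cached feature matrices and targets from disk."""
    with open(path, 'rb') as f:
        return pickle.load(f)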
    def load_and_preprocess(self):
        """
        Load data from pickle files and generate feature matrices, or load the cached ones.
        
        :return csr-matrix X_train: The preprocessed tf-idf feature matrix of the training samples.
        :return csr-matrix X_val: The preprocessed tf-idf feature matrix of the validation samples.
        :return numpy array y_train: The ratings corresponding to the samples in the training feature matrix.
        :return numpy array y_val: The ratings corresponding to the samples in the validation feature matrix.
        :return csr-matrix X_test: The preprocessed tf-idf feature matrix of the test samples.
        """

        # Check for pickled data
        if not os.path.exists(DEFAULT_PICKLE_PATH):
            print('Creating pickle file...')
            data.create_pickled_data(overwrite_old=True)

        # Check for cached features
        if self.use_cached_features:
            try:
                data_tuple = self.load_all()
                feature_count = data_tuple[0].shape[1]
                log('Loaded the cached preprocessed data. ({} features)'.
                    format(feature_count))
                self.val_reviews = self.load_val_reviews()
                return data_tuple
            except IOError:
                self.use_cached_features = False
                log('Could not load cached preprocessed data! '
                    'Doing the preprocessing now...')

        # Preprocess data
        if not self.use_cached_features:
            log('Loading test and train data...')
            dataset = data.load_pickled_data()

            log('Extracting features and reducing the dimensionality...')
            X_train, X_val, y_train, y_val, X_test = self.transform_data(
                dataset['train'], dataset['test'])

            # Save preprocessed data for later
            self.dump_all(X_train, X_val, y_train, y_val, X_test)
            self.dump_val_reviews(self.val_reviews)

        return X_train, X_val, y_train, y_val, X_test
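# transform_data itself is not shown here; a rough sketch of the general shape
# of such a method, assuming a plain TfidfVectorizer and a held-out validation
# split (the real Preprocessor additionally uses the a_value and epsilon
# settings passed to its constructor):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


def transform_data_sketch(train_reviews, test_reviews, test_size=0.1):
    """Turn raw reviews into tf-idf matrices plus a validation split."""
    texts = [review.content for review in train_reviews]
    ratings = [int(review.rating) for review in train_reviews]
    docs_train, docs_val, y_train, y_val = train_test_split(
        texts, ratings, test_size=test_size, random_state=0)

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(docs_train)
    X_val = vectorizer.transform(docs_val)
    X_test = vectorizer.transform([review.content for review in test_reviews])
    return X_train, X_val, y_train, y_val, X_test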
def main():
    # Load the data
    data_set = data.load_pickled_data()
    train_data = data_set['train']
    test_data = data_set['test']
    log('loaded dataset!')
    traindocs = [doc.content for doc in train_data if int(doc.rating) != 0]
    trainlabels = [
        int(doc.rating) for doc in train_data if int(doc.rating) != 0
    ]

    # Split the data
    if TEST_SIZE > 0:
        log('split dataset...')
        docs_train, docs_val, label_train, label_val = train_test_split(
            traindocs, trainlabels, test_size=TEST_SIZE, random_state=0)
    else:
        docs_train = traindocs
        label_train = trainlabels

    # Train a new Doc2Vec model, or load a prebuilt one
    if not USE_BUILD_MODEL:
        log('make iterator...')
        it = LabeledLineSentence(docs_train, label_train)
        log('start training NN')
        d2v = train_model(it)
    else:
        log('load pretrained model')
        d2v = gensim.models.Doc2Vec.load('Models/doc2vec0.2.model')

    train_features = fittransform_feature_matrix(d2v)
    # The validation vectors below assume TEST_SIZE > 0, so that a split exists
    log('start computing vectors for validation data...')
    val_features = transform_feature_matrix(d2v, docs_val)

    # Actual classification
    logistic = LogisticRegression(solver='sag', n_jobs=4, C=1, tol=0.1)
    logistic.fit(train_features, label_train)
    predictions = logistic.predict(val_features)
    log('Validation error = %s' %
        str(mean_absolute_error(predictions, label_val)))
    log(classification_report(predictions, label_val))
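# fittransform_feature_matrix and transform_feature_matrix are not defined in
# this snippet; a minimal sketch of the latter, assuming every document is
# mapped to a vector with Doc2Vec.infer_vector on its whitespace tokens:
import numpy as np


def transform_feature_matrix(d2v_model, docs):
    """Infer a Doc2Vec vector for every document and stack them into a matrix."""
    return np.vstack([d2v_model.infer_vector(doc.split()) for doc in docs])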
Example #10
def main():
    data.create_pickled_data(overwrite_old=True)
    dataset = data.load_pickled_data()
    train_set = dataset['train']
    test_set = dataset['test']
Example #11
# NOTE: the definition line of this helper is missing from the snippet; the
# name `drop_rows` is assumed here so that the function is complete.
def drop_rows(M, idx_to_drop):
    """
    Remove the given rows from a sparse CSR matrix.

    When working with sparse matrices, no deep copies are made.
    input arguments:
        M: the matrix from which to remove rows
        idx_to_drop: the indices of the rows to remove
    output arguments:
        M[mask]: csr matrix, with only the remaining rows
    """
    if not isinstance(M, scipy.sparse.csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    mask = np.ones(M.shape[0], dtype=bool)
    mask[idx_to_drop] = False
    return M[mask]
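# Example usage of the row-dropping helper above:
# >>> import numpy as np
# >>> import scipy.sparse
# >>> M = scipy.sparse.csr_matrix(np.arange(12).reshape(4, 3))
# >>> drop_rows(M, [1, 3]).toarray()
# array([[0, 1, 2],
#        [6, 7, 8]])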


if __name__ == '__main__':
    log('Loading test and train data...')
    dataset = data.load_pickled_data()
    train_data = dataset['train']

    # Split the training data into 4 folds (shuffle the folds, because of the random hotel split)
    # Preprocess and dump
    counter = 1
    for x in range(4):
        log('Extracting features and reducing the dimensionality for fold ' +
            str(counter))
        pp = Preprocessor(a_value=11, epsilon=0.1, reduction_level=0.025)
        X_train, X_val, y_train, y_val = pp.transform_data(train_data)
        dump_all(X_train, X_val, y_train, y_val, counter)
        dump_val_reviews(pp.val_reviews, counter)
        counter += 1