def main(run=None):
    """Execute a queued run when one is given, otherwise the manual experiment.

    Calls ``init_random`` and logs the start before dispatching.

    Args:
        run: Optional mapping describing a queued job. When truthy, its
            ``'algo_params'`` entry is expanded into keyword arguments for
            ``run_with``. When falsy, ``start_manual_run`` is called with
            ``run_and_say`` instead.
    """
    init_random()
    log_to_info('starting main')

    if not run:
        # No queued job was supplied: use the hand-configured experiment.
        start_manual_run(run_and_say)
        return

    run_with(**run['algo_params'])
def classify(mp, x_test, x_train, y_train):
    """Instantiate the classifier named by ``mp.classifier_name`` and run it.

    Note the argument order: this function receives ``x_test`` before the
    training data, whereas the classifier objects are called with
    ``(mp, x_train, y_train, x_test)``.

    Returns:
        Whatever the selected classifier's ``classify`` method returns.

    Raises:
        KeyError: If ``mp.classifier_name`` is not one of the registered names.
    """
    log_to_info("Starting classification")
    registry = {
        'logistic': LogisticClassifier,
        'logistic2': LogisticClassifier2,
        'rbf': RbfClassifier,
        'rbf_scv': RbfSVCClassifier,
        'svc': LinearSVCClassifier,
    }
    classifier = registry[mp.classifier_name]()
    return classifier.classify(mp, x_train, y_train, x_test)
    def classify(self, mp, train_centroids, training_data_sentiment, test_centroids, testing_data_ids):
        """Fit a random forest on the training centroids and predict the test set.

        The number of trees comes from ``mp.random_forest_estimators``.

        Returns:
            A ``pd.DataFrame`` with an ``'id'`` column (``testing_data_ids``)
            and a ``'sentiment'`` column (the forest's predictions).
        """
        forest = RandomForestClassifier(n_estimators=mp.random_forest_estimators)

        # Announce the fit up front; the original author notes it may take
        # a few minutes.
        log_to_info('Fitting a random forest to labeled training data...')
        fitted_forest = forest.fit(train_centroids, training_data_sentiment)
        predicted = fitted_forest.predict(test_centroids)

        columns = {'id': testing_data_ids, 'sentiment': predicted}
        return pd.DataFrame(data=columns)
Exemplo n.º 4
0
    def classify(self, mp, train_centroids, training_data_sentiment,
                 test_centroids, testing_data_ids):
        """Predict test sentiments with a freshly trained random forest.

        Args:
            mp: Parameter object; only ``random_forest_estimators`` is read.
            train_centroids: Feature rows used for fitting.
            training_data_sentiment: Labels matching ``train_centroids``.
            test_centroids: Feature rows to predict.
            testing_data_ids: Identifiers placed in the result's ``'id'`` column.

        Returns:
            ``pd.DataFrame`` with columns ``'id'`` and ``'sentiment'``.
        """
        model = RandomForestClassifier(
            n_estimators=mp.random_forest_estimators)

        log_to_info('Fitting a random forest to labeled training data...')
        # ``fit`` returns the estimator, so prediction can be chained.
        sentiments = model.fit(
            train_centroids, training_data_sentiment).predict(test_centroids)

        return pd.DataFrame(
            data={'id': testing_data_ids, 'sentiment': sentiments})
def slave_main():
    """Fetch one pending run, execute it and report the captured log output.

    Returns early, without reporting anything, when no pending run is
    available. An exception raised while executing the run is logged and the
    results are still reported afterwards.
    """
    log_handler, log_buffer = setup_logger(True)
    run = get_random_pending_run()
    # noinspection PyBroadException
    try:
        if run:
            main(run=run)
        else:
            log_to_info('Nothing to start, exiting')
            return
    except Exception:
        # Log the traceback; presumably it ends up in the buffer that is
        # reported below (depends on setup_logger).
        logging.exception('Unknown error')

    captured_log = get_log_output(log_handler, log_buffer)
    report_results(run['id'], captured_log)
def train_tfidf_vectors(mp, testing_reviews, training_reviews, x_train, x_test):
    """Compute TF-IDF features and optionally append them to existing features.

    The vectorizer is fitted on the training and testing ``'words'`` columns
    together, limited to ``mp.tfid_features`` features.

    Args:
        mp: Parameter object; only ``tfid_features`` is read.
        testing_reviews: Frame whose ``'words'`` column holds the test texts.
        training_reviews: Frame whose ``'words'`` column holds the train texts.
        x_train: Existing training features, or ``None`` to use TF-IDF alone.
        x_test: Existing test features (ignored when ``x_train`` is ``None``).

    Returns:
        Tuple ``(x_train, x_test)`` of dense feature arrays.
    """
    log_to_info('Training tfid vectors')
    vectorizer = TfidfVectorizer(max_features=mp.tfid_features)
    train_docs = list(training_reviews['words'].values)
    test_docs = list(testing_reviews['words'].values)
    n_train = len(train_docs)

    # Train rows come first in the fitted matrix, test rows after them.
    tfidf_matrix = vectorizer.fit_transform(train_docs + test_docs)
    train_tfidf = tfidf_matrix[:n_train].toarray()
    test_tfidf = tfidf_matrix[n_train:].toarray()

    if x_train is None:
        return train_tfidf, test_tfidf

    combined_train = np.concatenate([np.array(list(x_train)), train_tfidf], axis=1)
    combined_test = np.concatenate([np.array(list(x_test)), test_tfidf], axis=1)
    return combined_train, combined_test
    def classify(self,
                 model,
                 y_test_reviews,
                 reviews,
                 nearest_reviews_count=2000):
        """Predict 0/1 sentiment per test review by similarity-weighted voting.

        For every ``d2v_id`` in ``y_test_reviews`` the 10000 most similar
        entries are requested from ``model``. Entries whose label starts with
        ``'REVIEW_'`` and whose matching rows in ``reviews`` are all flagged
        ``use_for_classifier_training`` contribute their sentiment, weighted
        by similarity, until ``nearest_reviews_count`` votes are collected.

        Returns:
            A numpy array with one prediction per test review: 1 when the
            weighted sentiment exceeds half of the total weight, else 0
            (also 0 when no usable neighbour was found).
        """
        review_count = len(y_test_reviews)
        y_test = np.zeros(review_count)

        for i, d2v_numeric_id in enumerate(y_test_reviews['d2v_id']):
            if i % 100 == 0:
                log_to_info('Processing {0} of {1} ({2}%)'.format(
                    i, review_count, 1.0 * i / review_count * 100.0))
            d2v_id = get_d2v_identifier(d2v_numeric_id)

            weighted_sentiment = 0.0
            weight_total = 0.0
            votes = 0
            for neighbour in model.most_similar(d2v_id, topn=10000):
                label = neighbour[0]
                if not label.startswith('REVIEW_'):
                    continue
                neighbour_id = int(label.split('_')[1])
                matches = reviews[reviews['d2v_id'].eq(neighbour_id)]
                if not matches['use_for_classifier_training'].all():
                    continue
                neighbour_sentiment = matches['sentiment'].values[0]
                similarity = neighbour[1]
                weighted_sentiment += neighbour_sentiment * similarity
                weight_total += similarity
                votes += 1
                if votes >= nearest_reviews_count:
                    break

            if weight_total == 0:
                log_to_info('key {0} has no similar review!'.format(d2v_id))
                y_test[i] = 0
            elif weighted_sentiment <= weight_total * 0.5:
                y_test[i] = 0
            else:
                y_test[i] = 1

        return y_test
Exemplo n.º 8
0
def train_tfidf_vectors(mp, testing_reviews, training_reviews, x_train,
                        x_test):
    """Return train/test feature arrays extended with TF-IDF columns.

    A single ``TfidfVectorizer`` (capped at ``mp.tfid_features`` features) is
    fitted on the concatenated training and testing ``'words'`` values. If
    ``x_train`` is ``None`` the TF-IDF arrays are returned on their own;
    otherwise they are appended column-wise to ``x_train`` / ``x_test``.
    """
    log_to_info('Training tfid vectors')
    tfidf = TfidfVectorizer(max_features=mp.tfid_features)
    train_texts = list(training_reviews['words'].values)
    test_texts = list(testing_reviews['words'].values)
    boundary = len(train_texts)

    fitted = tfidf.fit_transform(train_texts + test_texts)
    train_block = fitted[:boundary].toarray()
    test_block = fitted[boundary:].toarray()

    def append_columns(existing, tfidf_block):
        # Existing features are materialised row by row before joining.
        return np.concatenate([np.array(list(existing)), tfidf_block],
                              axis=1)

    if x_train is not None:
        x_train = append_columns(x_train, train_block)
        x_test = append_columns(x_test, test_block)
    else:
        x_train, x_test = train_block, test_block

    return x_train, x_test
def build_train_and_test_x(mp, reviews):
    """Build the training features, training labels and test features.

    Training rows are those with ``use_for_classifier_training == True``;
    test rows are those with ``predict_sentiment == True``. Features are the
    doc2vec vectors (when ``mp.word_vector_dimensionality`` is truthy),
    optionally extended with TF-IDF columns (when ``mp.tfid_features`` is
    truthy).

    Args:
        mp: Parameter object; ``word_vector_dimensionality`` and
            ``tfid_features`` select the feature sources.
        reviews: Frame with ``'use_for_classifier_training'``,
            ``'predict_sentiment'``, ``'d2v_id'`` and ``'sentiment'`` columns.

    Returns:
        Tuple ``(x_train, y_train, x_test)`` of lists.

    Raises:
        ValueError: If neither feature source is enabled. Previously this
            surfaced as an opaque ``TypeError`` from ``list(None)`` at the
            very end of the function.
    """
    log_to_info('Starting to load data')

    if not mp.word_vector_dimensionality and not mp.tfid_features:
        raise ValueError(
            'No features enabled: set word_vector_dimensionality and/or '
            'tfid_features to a non-zero value')

    training_reviews = reviews[reviews['use_for_classifier_training'].eq(True)]
    testing_reviews = reviews[reviews['predict_sentiment'].eq(True)]

    x_train = None
    x_test = None

    if mp.word_vector_dimensionality:
        log_to_info('Starting to load the doc2vec model')
        model_dm = Doc2VecFactory(mp, reviews, 1).get_word2vec_model()
        model_dbow = Doc2VecFactory(mp, reviews, 0).get_word2vec_model()

        def to_vector(d2v_id):
            # Both models are handed to the converter for every review.
            return convert_to_vector(d2v_id, model_dm, model_dbow)

        x_train = training_reviews['d2v_id'].map(to_vector).values
        x_test = testing_reviews['d2v_id'].map(to_vector).values

    if mp.tfid_features:
        x_train, x_test = train_tfidf_vectors(mp, testing_reviews, training_reviews, x_train, x_test)

    y_train = list(training_reviews['sentiment'].values)
    return list(x_train), y_train, list(x_test)
    def classify(self, model, y_test_reviews, reviews, nearest_reviews_count=2000):
        """Classify each test review from the sentiments of its nearest reviews.

        Args:
            model: Object exposing ``most_similar(identifier, topn=...)`` that
                yields ``(label, similarity)`` entries.
            y_test_reviews: Frame whose ``'d2v_id'`` column lists the reviews
                to classify.
            reviews: Frame with ``'d2v_id'``, ``'use_for_classifier_training'``
                and ``'sentiment'`` columns.
            nearest_reviews_count: Maximum number of neighbouring reviews that
                may vote for a single prediction.

        Returns:
            Numpy array of 0/1 predictions, one per row of ``y_test_reviews``.
        """

        def weighted_vote(d2v_id):
            # Returns (similarity-weighted sentiment sum, total similarity)
            # over the usable neighbouring training reviews.
            sentiment_sum = 0.0
            nearness_sum = 0.0
            counted = 0
            for entry in model.most_similar(d2v_id, topn=10000):
                if not entry[0].startswith("REVIEW_"):
                    continue
                similar_id = int(entry[0].split("_")[1])
                rows = reviews[reviews["d2v_id"].eq(similar_id)]
                if not rows["use_for_classifier_training"].all():
                    continue
                sentiment = rows["sentiment"].values[0]
                nearness = entry[1]
                sentiment_sum += sentiment * nearness
                nearness_sum += nearness
                counted += 1
                if counted >= nearest_reviews_count:
                    break
            return sentiment_sum, nearness_sum

        y_test = np.zeros(len(y_test_reviews))

        for i, d2v_numeric_id in enumerate(y_test_reviews["d2v_id"]):
            if i % 100 == 0:
                progress = 1.0 * i / len(y_test_reviews) * 100.0
                log_to_info("Processing {0} of {1} ({2}%)".format(i, len(y_test_reviews), progress))
            d2v_id = get_d2v_identifier(d2v_numeric_id)
            sentiment_sum, total_nearness = weighted_vote(d2v_id)

            if total_nearness == 0:
                log_to_info("key {0} has no similar review!".format(d2v_id))
                y_test[i] = 0
            else:
                # Positive only when the weighted mean sentiment exceeds 0.5.
                y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1

        return y_test
    def classify(self, mp, x_train, y_train, x_test):
        """Fit an ``SVC`` on the training data and predict the test labels.

        ``mp`` is accepted for interface compatibility and is not read here.

        Returns:
            The predictions produced by ``SVC.predict`` for ``x_test``.
        """
        svc = SVC(cache_size=6000, verbose=True)
        log_to_info('Fitting a RBF SVC to labeled training data...')
        fitted_svc = svc.fit(x_train, y_train)
        log_to_info('Predicting test value')
        predictions = fitted_svc.predict(x_test)
        log_to_info('Done!')

        return predictions
Exemplo n.º 12
0
    def classify(self, mp, x_train, y_train, x_test):
        """Fit a ``LinearSVC`` (primal form, ``C=1.0``) and predict ``x_test``.

        ``mp`` is part of the shared classifier signature but is not used by
        this implementation.

        Returns:
            The predicted labels for ``x_test``.
        """
        linear_svc = LinearSVC(dual=False, verbose=False, C=1.0)
        log_to_info('Fitting a LinearSVC to labeled training data...')
        trained = linear_svc.fit(x_train, y_train)
        log_to_info('Predicting test value')
        predicted_labels = trained.predict(x_test)
        log_to_info('Done!')

        return predicted_labels
Exemplo n.º 13
0
    def classify(self, mp, x_train, y_train, x_test):
        """Train a support vector classifier and return its test predictions.

        Args:
            mp: Unused; present to match the other classifiers' signature.
            x_train: Training features.
            y_train: Training labels.
            x_test: Features to predict.
        """
        classifier = SVC(cache_size=6000, verbose=True)

        log_to_info('Fitting a RBF SVC to labeled training data...')
        classifier = classifier.fit(x_train, y_train)

        log_to_info('Predicting test value')
        result = classifier.predict(x_test)

        log_to_info('Done!')
        return result
    def classify(self, mp, x_train, y_train, x_test):
        """Fit a Nystroem feature map followed by ``LinearSVC`` and predict.

        ``mp`` is not read by this implementation.

        Returns:
            The pipeline's predictions for ``x_test``.
        """
        # gamma=0.00005 was noted by the original author as an alternative.
        steps = [
            ("feature_map", Nystroem(random_state=1, gamma=1.1, n_components=1000)),
            ("svm", LinearSVC()),
        ]
        model = pipeline.Pipeline(steps)
        log_to_info('Fitting a RBF SVM to labeled training data...')
        fitted_model = model.fit(x_train, y_train)
        log_to_info('Predicting test value')
        predictions = fitted_model.predict(x_test)
        log_to_info('Done!')

        return predictions
    def classify(self, mp, x_train, y_train, x_test):
        """Fit a ``sm.Logit`` model with an added constant column and predict.

        ``mp`` is not read by this implementation.

        Returns:
            The model's predictions for ``x_test`` rounded to the nearest
            integer with ``numpy.rint``.
        """
        x_train_reg = sm.add_constant(x_train)
        x_test_reg = sm.add_constant(x_test)
        # Log before fitting; previously this message was emitted only after
        # the fit had already completed.
        log_to_info('Fitting a Logistic Regression to labeled training data...')
        fitted = sm.Logit(y_train, x_train_reg).fit(disp=0)
        log_to_info('Predicting test value')
        y_test = fitted.predict(x_test_reg)
        log_to_info('Done!')

        return numpy.rint(y_test)
Exemplo n.º 16
0
    def classify(self, mp, x_train, y_train, x_test):
        """Predict ``x_test`` with a Nystroem-approximated kernel SVM pipeline.

        The pipeline has two steps: ``"feature_map"`` (``Nystroem`` with
        ``gamma=1.1`` and 1000 components) and ``"svm"`` (``LinearSVC``).
        ``mp`` is unused.
        """
        # The original author also recorded gamma=0.00005 as a candidate.
        kernel_map = Nystroem(random_state=1, gamma=1.1, n_components=1000)
        svm_pipeline = pipeline.Pipeline([
            ("feature_map", kernel_map),
            ("svm", LinearSVC()),
        ])

        log_to_info('Fitting a RBF SVM to labeled training data...')
        svm_pipeline = svm_pipeline.fit(x_train, y_train)

        log_to_info('Predicting test value')
        predicted = svm_pipeline.predict(x_test)

        log_to_info('Done!')
        return predicted
    def classify(self, mp, x_train, y_train, x_test):
        """Train a logit model and return rounded predictions for ``x_test``.

        A constant column is added to both feature sets via
        ``sm.add_constant`` before fitting and predicting. ``mp`` is unused.

        Returns:
            ``numpy.rint`` of the fitted model's predictions.
        """
        train_design = sm.add_constant(x_train)
        test_design = sm.add_constant(x_test)

        # Emit the progress message before the fit starts; it used to be
        # logged after fitting had already finished.
        log_to_info(
            'Fitting a Logistic Regression to labeled training data...')
        logit = sm.Logit(y_train, train_design)
        clf = logit.fit(disp=0)

        log_to_info('Predicting test value')
        predicted = clf.predict(test_design)
        log_to_info('Done!')

        return numpy.rint(predicted)
def run_with(**kwargs):
    """Run one complete experiment configured by ``kwargs`` and log its score.

    The keyword arguments are forwarded to ``ModelParameters``. The reviews
    are prepared, features are built, the rows flagged ``predict_sentiment``
    are classified, and the score plus the elapsed time are logged.
    """
    log_to_info(str(kwargs))
    check_paths_and_create_directories()
    started_at = time.time()
    mp = ModelParameters(**kwargs)

    reviews = DataPreparer(mp).convert()
    x_train, y_train, x_test = build_train_and_test_x(mp, reviews)

    to_predict = reviews['predict_sentiment'].eq(True)
    y_test_reviews = reviews[to_predict].copy()
    predictions = classify(mp, x_test, x_train, y_train)
    y_test_reviews['predicted_sentiment'] = predictions

    # The parameters are logged a second time, right before the score.
    log_to_info(str(kwargs))
    ScoreCalculator().print_score(y_test_reviews)
    elapsed = time.time() - started_at
    log_to_info('It took {0} seconds'.format(elapsed))
    def classify(self, x_train, y_train, x_test):
        """Fit a small neural-network regressor and predict ``x_test``.

        The network has one ``'Rectifier'`` layer with 400 units followed by a
        ``'Linear'`` output layer (learning rate 0.02, 10 iterations).

        Note that, unlike the other classifiers in this file, this signature
        has no ``mp`` parameter.

        Returns:
            The raw output of ``Regressor.predict`` for ``x_test``.
        """
        # Convert each input exactly once; the previous code wrapped every
        # array in np.array() a second time at the call sites, copying it
        # again for no benefit.
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)

        # An earlier experiment used a Classifier with Maxout(100 units,
        # 2 pieces) + Softmax layers, learning_rate=0.001, n_iter=25.
        nn = Regressor(layers=[Layer('Rectifier', units=400), Layer('Linear')], learning_rate=0.02, n_iter=10)
        log_to_info('Fitting a NN to labeled training data...')
        nn.fit(x_train, y_train)
        log_to_info('Predicting test value')
        y_test = nn.predict(x_test)
        log_to_info('Done!')

        return y_test
Exemplo n.º 20
0
 def classify(self, mp, x_train, y_train, x_test):
     """Fit a 5-fold ``LogisticRegressionCV`` and predict ``x_test``.

     A constant column is added to both feature sets with
     ``sm.add_constant``. The fitted parameters and the accuracy on the
     training data (as a percentage) are logged. ``mp`` is unused.

     Returns:
         The predicted labels for ``x_test``.
     """
     train_design = sm.add_constant(x_train)
     test_design = sm.add_constant(x_test)
     model = LogisticRegressionCV(verbose=1, cv=5)
     log_to_info(
         'Fitting a Logistic Regression to labeled training data...')
     fitted = model.fit(train_design, y_train)
     log_to_info('Training details')
     log_to_info('Classifier parameters: {}'.format(fitted.get_params()))
     training_accuracy = fitted.score(train_design, y_train) * 100.0
     log_to_info('On training: {}'.format(training_accuracy))
     log_to_info('Predicting test value')
     predictions = fitted.predict(test_design)
     log_to_info('Done!')
     return predictions
def start_manual_run(func):
    """Call ``func`` once with the currently selected hand-tuned settings.

    Args:
        func: Callable that accepts the experiment settings as keyword
            arguments.
    """
    # Earlier experiments were kept here as commented-out calls: sweeps over
    # `epochs` (1..20, with epochs_total of 3 or 20) using the 'rbf'
    # classifier under both the 'linear' and 'exp' learning-rate types.
    version = 77.8
    epochs = [3]
    epoch = max(epochs)
    # NOTE(review): the log label says "linear" while learning_rate_type
    # below is 'exp' - confirm which one is intended.
    log_to_info('Epoch linear: {}'.format(epoch))
    hpv = 0
    settings = dict(
        algorithm_version=version,
        word_vector_dimensionality=48,
        word_context_window=10,
        frequent_words_downsampling_dm=0.0001,
        frequent_words_downsampling_dbow=0.01,
        negative=25,
        hierarchical_paragraph_vectors=hpv,
        epochs=epoch,
        epochs_total=max(epochs),
        classifier_name='svc',
        classifier_c=0.0195,
        classifier_penalty='l2',
        tfid_features=0,
        learning_rate_type='exp',
        experiment_number=1,
    )
    func(**settings)
    log_to_info('Epoch was linear: {}'.format(epoch))
    def train_model(self):
        """Build (or resume from cache) the Doc2Vec model and train it per epoch.

        Reads ``self.reviews``, ``self.mp``, ``self.dm``, ``self.epochs``,
        ``self.workers`` and ``self.min_count``. ``self.dm == 0`` selects the
        DBOW configuration and ``self.dm == 1`` the DM configuration. After
        every trained epoch the model is assigned to ``self.model`` and
        ``self.store_model(epoch)`` is called; on return ``self.model`` holds
        the final model.
        """
        log_to_info('Loading training sentences')

        review_d2v_id_list = zip(self.reviews['words'], self.reviews['d2v_id'], self.reviews['best_topics'],
                                 self.reviews['second_best_topics'])

        labeled_reviews = []
        if self.dm == 0:
            log_to_info('applying dbow with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dbow))
        elif self.dm == 1:
            log_to_info('applying dm with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dm))

        for space_separated_words, d2v_id, best_topic, second_best_topic in review_d2v_id_list:
            if self.dm == 0:
                labeled_reviews.extend(
                    convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dbow, space_separated_words, d2v_id, best_topic,
                                              second_best_topic))
            elif self.dm == 1:
                labeled_reviews.extend(
                    convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dm, space_separated_words, d2v_id, best_topic,
                                              second_best_topic))

        log_to_info('Loading Doc2Vec model...')
        # Search backwards from the final epoch for the most recent cached
        # model. `start_epoch` ends up one past the cached epoch, or at 1
        # when nothing is cached.
        start_epoch = self.epochs + 1
        model = None
        for epoch in range(self.epochs, 0, -1):
            model = self.load_model(epoch)
            if model:
                log_to_info('Found model in cache!')
                break
            start_epoch = epoch

        if not model:
            if self.dm == 0:
                # PV-DBOW
                log_to_info('Yep, this is DBOW!')
                model = Doc2Vec(dm=self.dm, hs=0, workers=self.workers, size=self.mp.word_vector_dimensionality, min_count=self.min_count,
                                window=self.mp.word_context_window, sample=self.mp.frequent_words_downsampling_dbow, seed=random_int(),
                                negative=self.mp.negative)
            elif self.dm == 1:
                # PV-DM w/average
                log_to_info('Yep, this is DM!')
                model = Doc2Vec(dm=self.dm, dm_mean=1, hs=0, workers=self.workers, size=self.mp.word_vector_dimensionality,
                                min_count=self.min_count, window=self.mp.word_context_window, sample=self.mp.frequent_words_downsampling_dm,
                                seed=random_int(), negative=self.mp.negative)

            # Time only the vocabulary build. The timer used to be restarted
            # after build_vocab(), so the logged duration was always ~0.
            start1 = time.time()
            model.build_vocab(labeled_reviews)
            end1 = time.time()
            log_to_info('Vocab building for dm{0} took {1} seconds'.format(self.dm, end1 - start1))

        log_to_info('Training Doc2Vec model...')
        for epoch in range(start_epoch, self.epochs + 1):
            log_to_info('Epoch {0} of {1}'.format(epoch, self.epochs))
            m = self.load_model(epoch)
            if m is not None:
                log_to_info('Found model in cache!')
                model = m
                continue

            # Train on a shuffled copy so the base list keeps its order.
            permuted_labeled_reviews = labeled_reviews[:]
            random.shuffle(permuted_labeled_reviews)

            # Pin alpha and min_alpha to the same value so the learning rate
            # is fixed for this epoch's train() call.
            alpha = alpha_for_epoch(epoch, self.mp.epochs_total, self.mp.alpha_max, self.mp.alpha_min, self.mp.learning_rate_type)
            model.min_alpha, model.alpha = alpha, alpha

            start2 = time.time()
            model.train(permuted_labeled_reviews)
            end2 = time.time()
            log_to_info('DM HPV is {0}, DBOW HPV is {1}'.format(self.mp.hierarchical_paragraph_vectors_dm,
                                                                self.mp.hierarchical_paragraph_vectors_dbow))
            log_to_info('Model training for dm{0} took {1} seconds'.format(self.dm, end2 - start2))

            # Assign before storing: store_model presumably persists
            # self.model - TODO confirm.
            self.model = model
            self.store_model(epoch)

        self.model = model
Exemplo n.º 23
0
def main():
    """Build or load the dictionary, corpus and topic model, then infer topics.

    Each artefact is loaded from its cache file when that file exists and is
    otherwise created and saved. Nothing is done when the reviews already
    contain ``'best_topics'``.
    """
    setup_logger()
    init_random()
    log_to_info('starting main')
    started_at = time.time()

    reviews = _get_reviews()
    if 'best_topics' in reviews:
        log_to_info('best topics already set, aborting!')
        return

    log_to_info('getting reviews done')

    # The documents are only needed when at least one of the two caches that
    # are built from them is missing.
    if not (os.path.exists(dictionary_cache_name) and os.path.exists(mm_cache_name)):
        documents = _get_documents(reviews)

    log_to_info('dictionary')

    if not os.path.exists(dictionary_cache_name):
        dictionary = gensim.corpora.Dictionary(documents)
        dictionary.save(dictionary_cache_name)
    else:
        dictionary = gensim.corpora.Dictionary.load(dictionary_cache_name)

    log_to_info('mm')
    if not os.path.exists(mm_cache_name):
        corpus = [dictionary.doc2bow(text) for text in documents]
        gensim.corpora.MmCorpus.serialize(mm_cache_name, corpus)
    else:
        corpus = gensim.corpora.MmCorpus(mm_cache_name)

    log_to_info('lda')
    if os.path.exists(lda_cache_name):
        model_cls = gensim.models.LsiModel if model_type == 'lsi' else gensim.models.LdaModel
        topic_model = model_cls.load(lda_cache_name)
    else:
        if model_type == 'lsi':
            topic_model = gensim.models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        else:
            # NOTE(review): LDA uses a literal 20 topics while LSI uses
            # `num_topics` - confirm this difference is intended.
            extra_options = {} if single_pass else dict(update_every=0, passes=20)
            topic_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, **extra_options)
        topic_model.save(lda_cache_name)

    log_to_info('it took {0} seconds'.format(time.time() - started_at))
    infer_topics(topic_model, reviews)
    log_to_info('everything took {0} seconds'.format(time.time() - started_at))
 def print_score(self, reviews):
     """Log the score from ``self.calculate_score(reviews)`` as a percentage
     together with the correct/total counts."""
     percentage, n_correct, n_total = self.calculate_score(reviews)
     message = "Score: {0}%, which is {1}/{2}".format(percentage, n_correct, n_total)
     log_to_info(message)
Exemplo n.º 25
0
def classify(mp, x_test, x_train, y_train):
    """Dispatch to the classifier class registered under ``mp.classifier_name``.

    The selected class is instantiated without arguments and its ``classify``
    method is called as ``classify(mp, x_train, y_train, x_test)``; note that
    this differs from the parameter order of this function.

    Raises:
        KeyError: If ``mp.classifier_name`` is not a known classifier name.
    """
    log_to_info('Starting classification')
    available = {
        'logistic': LogisticClassifier,
        'logistic2': LogisticClassifier2,
        'rbf': RbfClassifier,
        'rbf_scv': RbfSVCClassifier,
        'svc': LinearSVCClassifier,
    }
    chosen_cls = available[mp.classifier_name]
    return chosen_cls().classify(mp, x_train, y_train, x_test)
def start_manual_run(func):
    """Run the single, manually configured experiment through ``func``.

    ``func`` receives every setting as a keyword argument; the chosen epoch
    is logged before and after the call.
    """
    # History: this function used to carry many commented-out calls that
    # swept `epochs` from 1 to 20 (epochs_total 3 or 20) with the 'rbf'
    # classifier and learning_rate_type 'linear' or 'exp'. Only the active
    # configuration is kept.
    version = 77.8
    epochs = [3]
    epoch = max(epochs)
    total_epochs = max(epochs)
    hpv = 0

    # NOTE(review): messages say "linear" but learning_rate_type is 'exp'.
    log_to_info('Epoch linear: {}'.format(epoch))
    func(
        algorithm_version=version,
        word_vector_dimensionality=48,
        word_context_window=10,
        frequent_words_downsampling_dm=0.0001,
        frequent_words_downsampling_dbow=0.01,
        negative=25,
        hierarchical_paragraph_vectors=hpv,
        epochs=epoch,
        epochs_total=total_epochs,
        classifier_name='svc',
        classifier_c=0.0195,
        classifier_penalty='l2',
        tfid_features=0,
        learning_rate_type='exp',
        experiment_number=1,
    )
    log_to_info('Epoch was linear: {}'.format(epoch))
def force_exists(path):
    """Terminate the process with status 1 when ``path`` does not exist.

    Args:
        path: Filesystem path that must exist.

    Raises:
        SystemExit: With exit code 1, after logging, if ``path`` is missing.
    """
    # Imported locally so the block does not depend on a file-level import.
    import sys

    if not os.path.exists(path):
        log_to_info('The file {0} does not exist!'.format(path))
        # sys.exit() instead of the builtin exit(): the latter is injected
        # by the `site` module for interactive use and may be unavailable.
        sys.exit(1)
 def classify(self, mp, x_train, y_train, x_test):
     """Cross-validated logistic regression: fit, log diagnostics, predict.

     Args:
         mp: Unused; kept for the shared classifier signature.
         x_train: Training features (a constant column is added).
         y_train: Training labels.
         x_test: Test features (a constant column is added).

     Returns:
         Predictions of the fitted ``LogisticRegressionCV`` for the test set.
     """
     x_train = sm.add_constant(x_train)
     x_test = sm.add_constant(x_test)

     estimator = LogisticRegressionCV(verbose=1, cv=5)
     log_to_info('Fitting a Logistic Regression to labeled training data...')
     estimator = estimator.fit(x_train, y_train)

     log_to_info('Training details')
     params_line = 'Classifier parameters: {}'.format(estimator.get_params())
     log_to_info(params_line)
     # Accuracy on the training data itself, reported as a percentage.
     score_line = 'On training: {}'.format(estimator.score(x_train, y_train) * 100.0)
     log_to_info(score_line)

     log_to_info('Predicting test value')
     predicted = estimator.predict(x_test)
     log_to_info('Done!')
     return predicted
Exemplo n.º 29
0
 def print_score(self, reviews):
     """Compute the score for ``reviews`` and write it to the info log.

     ``self.calculate_score`` must return a ``(score, correct, total)``
     triple.
     """
     score_triple = self.calculate_score(reviews)
     score, correct, total = score_triple
     template = 'Score: {0}%, which is {1}/{2}'
     log_to_info(template.format(score, correct, total))
def force_exists(path):
    """Abort the program (exit status 1) unless ``path`` exists on disk.

    A message naming the missing path is logged before exiting. Returns
    ``None`` when the path exists.
    """
    if os.path.exists(path):
        return

    log_to_info('The file {0} does not exist!'.format(path))
    # Use sys.exit rather than the `site`-provided exit() helper, which is
    # meant for the interactive shell and is not guaranteed to be defined.
    import sys
    sys.exit(1)