from myWordcloud import my_wordcloud
from classifiers.__init__ import run_all_classifiers
from supportFuncs import stopWords

# Script entry point: call my_wordcloud first, then run_all_classifiers,
# sharing one stop-word collection between the two.
if __name__ == '__main__':
    # The pipeline path currently does not run cross-validation, so the
    # regular (non-pipeline) path is selected.
    usePipeline = False
    stop_words = stopWords.get_stop_words()

    my_wordcloud(stop_words)
    run_all_classifiers(stop_words, usePipeline)
예제 #2
0
    clf.fit(vectorTrain, train_data['Category'])
    y_pred = clf.predict(vectorTest)

    # print "Train Accuracy :: ", accuracy_score(train_data['Category'], clf.predict(vectorTrain))
    # print "Test Accuracy :: ", accuracy_score(train_data['Category'], y_pred)

    #y_pred = cross_val_predict(clf, X=vectorTrain, y=vectorTest, cv=10, n_jobs=multiprocessing.cpu_count())
    writePredictionsToCsv.write_predictions_to_csv(y_pred, test_data,
                                                   dynamic_datasets_path)

    # Best GridSearch params
    # print clf.best_params_

    print("Elapsed time of successional-run: ",
          time.time() - start_time_successional)

    print('MyMethodClassifier finished!\n')
    return scores


# Run myMethodClassifier directly:
if __name__ == '__main__':
    import sys

    # The same base path is handed to the dataset reader and to the
    # classifier.
    dynamic_datasets_path = '..'
    data = readDatasets.read_dataset(dynamic_datasets_path)
    # read_dataset's result is indexed positionally: [0] is passed on as the
    # train data and [1] as the test data.
    trainData, testData = data[0], data[1]

    my_method_classifier(stopWords.get_stop_words(), trainData, testData,
                         dynamic_datasets_path)
    # sys.exit() replaces the site-injected exit() helper, which is meant for
    # the interactive shell and is unavailable when Python runs without the
    # `site` module (e.g. `python -S`).
    sys.exit()
        # GridSearch (find the best parameters)
        # parameters = {'n_neighbors': [5, 10], 'weights': ['uniform', 'distance'], 'algorithm' : ['ball_tree', 'kd_tree', 'brute'], 'p': [1, 2]}
        # svr = KNeighborsClassifier()
        # clf = GridSearchCV(svr, parameters)

        clf.fit(vectorTrain, train_y)
        y_pred = clf.predict(vectorTest)
        #
        print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(vectorTrain)))
        print("Test Accuracy :: ", accuracy_score(test_y, y_pred))

        # Best GridSearch params
        # print clf.best_params_

        print("Elapsed time of successional-run: ", time.time() - start_time_successional)

    print('knnClassifier finished!\n')
    return scores


# Run the knn classifier directly (the call below targets knn(), not the SVM
# classifier):
if __name__ == '__main__':
    import sys

    # Base path handed to the dataset reader.
    dynamic_datasets_path = '..'
    data = readDatasets.read_dataset(dynamic_datasets_path)
    # read_dataset's result is indexed positionally: [0] is passed on as the
    # train data and [1] as the test data.
    trainData, testData = data[0], data[1]
    usePipeline = False

    knn(stopWords.get_stop_words(), trainData, testData, usePipeline)
    # sys.exit() replaces the site-injected exit() helper, which is meant for
    # the interactive shell and is unavailable when Python runs without the
    # `site` module (e.g. `python -S`).
    sys.exit()
        # GridSearch
        # parameters = {'n_estimators': [130, 110, 100, 80, 50, 30, 20, 10]}
        # svr = RandomForestClassifier()
        # clf = GridSearchCV(svr, parameters)

        # clf.fit(vectorTrain, train_y)
        # y_pred = clf.predict(vectorTest)
        #
        # print "Train Accuracy :: ", accuracy_score(train_y, clf.predict(vectorTrain))
        # print "Test Accuracy :: ", accuracy_score(test_y, y_pred)

        # Best GridSearch params
        # print clf.best_params_

        print("Elapsed time of successional-run: ",
              time.time() - start_time_successional)

    print('rfClassifier finished!\n')
    return scores


# Stand-alone entry point for rf_classifier.
if __name__ == '__main__':
    # Non-pipeline mode is requested for the direct run.
    usePipeline = False

    # read_dataset() is called without arguments here; element [0] of its
    # result is passed on as the train data and element [1] as the test data.
    data = readDatasets.read_dataset()
    trainData, testData = data[0], data[1]

    rf_classifier(
        stopWords.get_stop_words(),
        trainData,
        testData,
        usePipeline,
    )
예제 #5
0
        # print(neighbors)
        result = getResponse(neighbors)
        # print result
        predictions.append(result)
        # print('> predicted=' + repr(result) + ', actual=' + repr(train_data['Category'][8586+x]))
        if result == train_data['Category'][8586 + x]:
            count += 1

    # print 'Test', test_x[1:2], 'Pred', predictions[0]
    # accuracies.append(getAccuracy(test_data, predictions))
    # print('Accuracy: ' + repr(accuracy) + '%')
    print("Got right", count, "out of", 100)

    # Final accuracy after crossValidation
    # print accuracies
    # print np.mean(accuracies)

    print("Elapsed time of successional-run: ",
          time.time() - start_time_successional)


# Run knnClassifier directly:
if __name__ == '__main__':
    import sys

    # Base path handed to the dataset reader.
    dynamic_datasets_path = '..'
    data = readDatasets.read_dataset(dynamic_datasets_path)
    # read_dataset's result is indexed positionally: [0] is passed on as the
    # train data and [1] as the test data.
    trainData, testData = data[0], data[1]

    knn_classifier(stopWords.get_stop_words(), trainData, testData)
    # sys.exit() replaces the site-injected exit() helper, which is meant for
    # the interactive shell and is unavailable when Python runs without the
    # `site` module (e.g. `python -S`).
    sys.exit()
예제 #6
0
    y = np.array(accuraccies)

    x_new = np.linspace(x.min(), x.max(), 500)

    f = interp1d(x, y, kind='quadratic')
    y_smooth = f(x_new)
    plt.xlabel('n_components')
    plt.ylabel('Accuracy')
    plt.title('RandomForestClassifier Accuracy Graph')
    plt.plot(x_new, y_smooth, color='r')
    plt.scatter(x, y)
    plt.savefig(
        os.path.join(dynamic_datasets_path, 'Resources', 'images',
                     'RandomForestClassifier Accuracy Graph.png'))
    plt.show()
    print('rfClassifier finished!\n')
    return scores


# Run rf_classifier_with_graph directly:
if __name__ == '__main__':
    import sys

    # Base path handed to the dataset reader.
    dynamic_datasets_path = '..'
    data = readDatasets.read_dataset(dynamic_datasets_path)
    # read_dataset's result is indexed positionally: [0] is passed on as the
    # train data and [1] as the test data.
    trainData, testData = data[0], data[1]
    usePipeline = False

    rf_classifier_with_graph(stopWords.get_stop_words(), trainData, testData,
                             usePipeline)
    # sys.exit() replaces the site-injected exit() helper, which is meant for
    # the interactive shell and is unavailable when Python runs without the
    # `site` module (e.g. `python -S`).
    sys.exit()
    with open('Resources/csv/train_set.csv', mode='r',
              encoding="utf8") as csvfile:
        csvReader = csv.DictReader(csvfile, delimiter='\t', quotechar='|')

        for row in csvReader:
            category = row["Category"]
            if category == 'Business':
                businessStr += row["Content"]
            elif category == 'Politics':
                politicsStr += row["Content"]
            elif category == 'Football':
                footballStr += row["Content"]
            elif category == 'Film':
                filmStr += row["Content"]
            elif category == 'Technology':
                technologyStr += row["Content"]

    show_wordcloud(stop_words, businessStr, 'Business')
    show_wordcloud(stop_words, politicsStr, 'Politics')
    show_wordcloud(stop_words, footballStr, 'Football')
    show_wordcloud(stop_words, filmStr, 'Film')
    show_wordcloud(stop_words, technologyStr, 'Technology')

    print('myWordcloud finished!\n')


# Stand-alone entry point for my_wordcloud.
if __name__ == '__main__':
    # Fetch the stop words first, then hand them to my_wordcloud.
    stop_words = stopWords.get_stop_words()
    my_wordcloud(stop_words)
예제 #8
0
        # Write the scores.
        csvWriter.writerow(['Accuracy'] + ['{:.3}'.format(nbScores[0])] +
                           ['{:.3}'.format(rfScores[0])] +
                           ['{:.3}'.format(svmScores[0])] + ['knn'] +
                           ['{:.3}'.format(mymethodScores[0])])
        # + [knnScores[0]] + [mymethodScores[0]])
        csvWriter.writerow(['Precision'] + ['{:.3}'.format(nbScores[1])] +
                           ['{:.3}'.format(rfScores[1])] +
                           ['{:.3}'.format(svmScores[1])] + ['knn'] +
                           ['{:.3}'.format(mymethodScores[1])])
        # + [knnScores[1]]] + [mymethodScores[1]])
        csvWriter.writerow(['Recall'] + ['{:.3}'.format(nbScores[2])] +
                           ['{:.3}'.format(rfScores[2])] +
                           ['{:.3}'.format(svmScores[2])] + ['knn'] +
                           ['{:.3}'.format(mymethodScores[2])])
        # + [knnScores[2]] + [mymethodScores[2]])
        csvWriter.writerow(['F-Measure'] + ['{:.3}'.format(nbScores[3])] +
                           ['{:.3}'.format(rfScores[3])] +
                           ['{:.3}'.format(svmScores[3])] + ['knn'] +
                           ['{:.3}'.format(mymethodScores[3])])
        # + [knnScores[3]]] + [mymethodScores[3]])

    print('Finished writing to the outputCsvFile!')


# Stand-alone entry point for run_all_classifiers.
if __name__ == '__main__':
    # Non-pipeline mode is requested for the direct run.
    usePipeline = False
    stop_words = stopWords.get_stop_words()
    run_all_classifiers(stop_words, usePipeline)
        csvReader = csv.DictReader(csvfile, delimiter='\t', quotechar='|')

        for row in csvReader:
            category = row["Category"]
            if category == 'Business':
                businessStr += row["Content"]
            elif category == 'Politics':
                politicsStr += row["Content"]
            elif category == 'Football':
                footballStr += row["Content"]
            elif category == 'Film':
                filmStr += row["Content"]
            elif category == 'Technology':
                technologyStr += row["Content"]

    show_wordcloud(stop_words, businessStr, 'Business', dynamic_datasets_path)
    show_wordcloud(stop_words, politicsStr, 'Politics', dynamic_datasets_path)
    show_wordcloud(stop_words, footballStr, 'Football', dynamic_datasets_path)
    show_wordcloud(stop_words, filmStr, 'Film', dynamic_datasets_path)
    show_wordcloud(stop_words, technologyStr, 'Technology',
                   dynamic_datasets_path)

    print('myWordcloud finished!\n')


# Run myWordcloud directly:
if __name__ == '__main__':
    import sys

    # An empty base path is passed to my_wordcloud for the direct run.
    dynamic_datasets_path = ''
    my_wordcloud(stopWords.get_stop_words(), dynamic_datasets_path)
    # sys.exit() replaces the site-injected exit() helper, which is meant for
    # the interactive shell and is unavailable when Python runs without the
    # `site` module (e.g. `python -S`).
    sys.exit()