예제 #1
0
def main():
    """Run the train-then-classify pipeline from the command line.

    Expects exactly three positional arguments: the training database, the
    test database and the language/model name that is handed to
    ``spacy.load``.  The paths of the intermediate files
    (``training-samples.dat``, ``classifier.svm``, ``test-samples.dat``) are
    placed under the temporary directory, which is ``opts.temporary_dir``
    when given and otherwise the directory of the training database (or the
    current directory when the training database is a bare file name).

    Problems with the arguments (wrong count, missing temporary directory)
    are reported through ``parser.error``.
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")

    training_sqlite3, test_sqlite3, process_language = args

    print("Arguments: %s %s %s" %
          (training_sqlite3, test_sqlite3, process_language))

    # P.dirname() yields "" for a bare file name and P.isdir("") is always
    # False, so fall back to the current directory instead of rejecting a
    # perfectly valid invocation.
    temporary_dir = opts.temporary_dir or P.dirname(training_sqlite3) or "."
    if not P.isdir(temporary_dir):
        parser.error("error: temporary directory %s does not exist" %
                     temporary_dir)

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    # Load the language model once; it is shared by the training and the
    # test stages below.
    nlp = spacy.load(process_language)

    #
    # Training section
    #
    log("Preparing SQLite training database")
    # NOTE(review): assuming P is os.path, an absolute training path is kept
    # as-is by join(), but a relative path that contains a directory ends up
    # nested under temporary_dir -- confirm that this is intended.
    training_sqlite = P.join(temporary_dir, training_sqlite3)
    prep_sqlite(training_sqlite)

    log("Calculating dimensions")
    calc_dim(nlp, training_sqlite, 0, False)
    log("Excluding dimensions")
    exclude_stopwords_spacy(nlp, training_sqlite, process_language)
    exclude_non_alpha_partial(training_sqlite)
    exclude_unigrams_shorter_than(training_sqlite, 3)
    exclude_ngrams_shorter_than(training_sqlite, 1)
    log("Pruning excluded dimensions")
    prune(training_sqlite)
    log("Indexing training database")

    index(training_sqlite, nlp)

    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm")
    learn(training_sqlite, training_samples, classifier)

    #
    # Test section
    #
    log("Preparing SQLite test database")
    test_sqlite = P.join(temporary_dir, test_sqlite3)
    prep_sqlite(test_sqlite)

    # The test database reuses the dimensions selected on the training data.
    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")

    index(test_sqlite, nlp)

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    log("Classifying test samples")
    classify(test_sqlite, test_samples, classifier, False, temporary_dir)
예제 #2
0
def main():
    """Train a classifier on one database and rank or classify another.

    Expects exactly three positional arguments: ``mode`` (``"rank"`` or
    ``"classify"``), the training database and the test database.  Both
    databases are converted to SQLite files inside the temporary directory,
    which is ``opts.temporary_dir`` when given and otherwise the directory of
    the training database (or the current directory when the training
    database is a bare file name).

    The classifier is trained in both modes; the mode only selects what is
    done with the test samples afterwards.  Problems with the arguments are
    reported through ``parser.error``.
    """
    parser = create_parser()
    opts, args = parser.parse_args()
    if len(args) != 3:
        parser.error("invalid number of arguments")

    mode, training_mdb, test_mdb = args
    if mode not in ("rank", "classify"):
        parser.error("invalid mode: %s" % mode)

    # P.dirname() yields "" for a bare file name and P.isdir("") is always
    # False, so fall back to the current directory in that case.
    temporary_dir = opts.temporary_dir or P.dirname(training_mdb) or "."
    if not P.isdir(temporary_dir):
        # repr() replaces the Python-2-only backtick syntax; output is the same.
        parser.error("error: temporary directory %s does not exist" % repr(temporary_dir))

    def log(message):
        """Print *message* prefixed with the current ISO-8601 timestamp."""
        # A parenthesised single argument prints identically on Python 2 and 3.
        print("[%s] %s" % (datetime.datetime.now().isoformat(), message))

    #
    # Training section
    #
    log("Converting training database from MS Access to SQLite")
    training_sqlite = P.join(temporary_dir, P.splitext(P.basename(training_mdb))[0] + ".sqlite3")
    mdb2sqlite(training_mdb, training_sqlite, zero_score=False)

    log("Calculating dimensions")
    calc_dim(training_sqlite)
    log("Excluding dimensions")
    exclude_stopwords(training_sqlite)
    exclude_non_alpha_partial(training_sqlite)
    exclude_shorter_than(training_sqlite, 3)
    log("Pruning excluded dimensions")
    prune(training_sqlite)
    log("Indexing training database")
    index(training_sqlite, IndexingOptions())
    log("Running mRMR algorithm to select features")
    mrmr(training_sqlite, temporary_dir)
    log("Pruning excluded dimensions (again)")
    prune(training_sqlite)

    log("Outputting training samples to temporary data file")
    training_samples = P.join(temporary_dir, "training-samples.dat")
    svmvec(training_sqlite, training_samples)

    log("Selecting best value for C parameter")
    best_c = parameter_selection(training_samples)

    log("Training classifier")
    classifier = P.join(temporary_dir, "classifier.svm.dlib")
    trainer(training_samples, classifier, best_c)

    #
    # Ranking section
    #
    log("Converting test database from MS Access to SQLite")
    test_sqlite = P.join(temporary_dir, P.splitext(P.basename(test_mdb))[0] + ".sqlite3")
    mdb2sqlite(test_mdb, test_sqlite, zero_score=True)

    # The test database reuses the dimensions selected on the training data.
    log("Copying dimensions from training database to test database")
    copy_dim(training_sqlite, test_sqlite)

    log("Indexing test database")
    index(test_sqlite, IndexingOptions())

    log("Outputting test samples to temporary data file")
    test_samples = P.join(temporary_dir, "test-samples.dat")
    svmvec(test_sqlite, test_samples)

    if mode == "rank":
        log("Ranking test samples")
        ranker(training_samples, test_samples, best_c)
    elif mode == "classify":
        log("Classifying test samples")
        dlib_classifier(test_sqlite, test_samples, classifier)
        #
        # This won't work on OS/X since the ODBC driver is read-only :(
        #
        log("Copying scores to test database (MS Access)")
        copy_scores(test_sqlite, test_mdb)
    else:
        # mode was validated above, so this is unreachable; raise explicitly
        # because a bare ``assert`` is stripped when Python runs with -O.
        raise AssertionError("unexpected mode: %r" % mode)