Example #1
import argparse
import codecs
import os

import joblib

parser = argparse.ArgumentParser()
parser.add_argument('train_file', help='file containing the train data')
parser.add_argument('classifier', help='pickled classifier to apply')
parser.add_argument('in_dir', help='directory containing the input .txt files')
parser.add_argument('out_dir', help='directory to write the predictions to')
args = parser.parse_args()

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# load classifier
clf = joblib.load(args.classifier)

text_files = [fi for fi in os.listdir(args.in_dir) if fi.endswith(".txt")]
for i, text_file in enumerate(text_files):
    in_file = os.path.join(args.in_dir, text_file)
    print("{} of {}".format(i + 1, len(text_files)))
    print("In:", in_file)

    # load data (get_data is a project-local helper; passing the train file
    # keeps the label set consistent with the training run)
    X_train, X_data, Y_train, Y_data, classes_ = get_data(args.train_file, in_file)

    # classify
    pred = clf.predict(X_data)

    # save results
    out_file = os.path.join(args.out_dir, text_file)
    print("Out:", out_file)

    # reload the input with ids attached, so each prediction can be written
    # next to the instance it belongs to
    X_data_with_ids, Y_data = load_data(in_file)

    with codecs.open(out_file, "wb", "utf8") as f:
        for x, y in zip(X_data_with_ids, pred):
            f.write(u"{}\t{}\n".format(x, "_".join(classes_[y]) or "None"))

    print()
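The scripts lean on the project-local helpers get_data and load_data, whose bodies are not shown. From how their return values are used above (id-prefixed instances, a binary label matrix, a shared classes_ array, and "_"-joined label strings), a minimal sketch could look as follows; the tab-separated file layout and both helper bodies are assumptions, not the project's code:

import codecs

from sklearn.preprocessing import MultiLabelBinarizer


def load_data(data_file):
    # assumed layout: one "id<TAB>text<TAB>label1_label2" line per instance
    X, Y = [], []
    with codecs.open(data_file, 'r', 'utf8') as f:
        for line in f:
            doc_id, text, labels = line.rstrip('\n').split('\t')
            X.append(u'{}\t{}'.format(doc_id, text))
            Y.append(labels.split('_') if labels != 'None' else [])
    return X, Y


def get_data(train_file, test_file):
    X_train_ids, Y_train_raw = load_data(train_file)
    X_test_ids, Y_test_raw = load_data(test_file)
    # fit the binarizer on the train labels so both splits share one
    # classes_ ordering; labels unseen in training are dropped with a warning
    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(Y_train_raw)
    Y_test = mlb.transform(Y_test_raw)
    # strip the id column before vectorization
    X_train = [x.split('\t', 1)[1] for x in X_train_ids]
    X_test = [x.split('\t', 1)[1] for x in X_test_ids]
    return X_train, X_test, Y_train, Y_test, mlb.classes_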
Example #2
import argparse
import os
import string

import joblib
from nltk.corpus import stopwords as sw  # assumed: NLTK's stopwords corpus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

parser = argparse.ArgumentParser()
parser.add_argument('train_file', help='file containing the train data')
parser.add_argument('output_dir', help='directory to save the classifier to')
args = parser.parse_args()

stopwords = sw.words('dutch') + list(string.punctuation)

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

classifier_file = '{}/classifier.pkl'.format(args.output_dir)

# train on the full train file (it is passed as both the train and the test
# split; get_data is a project-local helper)
X_train, X_test, Y_train, Y_test, classes_ = get_data(args.train_file,
                                                      args.train_file)

# split is a project-local tokenizer; RandomKLabelsets is the project's
# RAkEL-style ensemble over random subsets of 3 labels
clf = make_pipeline(TfidfVectorizer(analyzer=split,
                                    stop_words=stopwords),
                    RandomKLabelsets(LinearSVC(class_weight='balanced'),  # 'auto' in older scikit-learn
                                     n_estimators=Y_train.shape[1]*2,
                                     labels_per_estimator=3))
clf.fit(X_train, Y_train)

# save classifier
joblib.dump(clf, classifier_file)

print('saved', classifier_file)
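Judging by its constructor arguments, RandomKLabelsets looks like an implementation of the RAkEL idea: every base estimator is fit on a random subset of labels_per_estimator labels, each observed combination of those labels is treated as a single class, and per-label predictions are recovered by voting. A minimal sketch of that scheme, assuming dense 0/1 label matrices; the class below is illustrative, not the project's implementation:

import numpy as np
from sklearn.base import clone
from sklearn.svm import LinearSVC


class RandomKLabelsetsSketch:
    """Illustrative RAkEL-style ensemble; not the project's actual class."""

    def __init__(self, base_estimator, n_estimators, labels_per_estimator,
                 random_state=0):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.labels_per_estimator = labels_per_estimator
        self.random_state = random_state

    def fit(self, X, Y):
        Y = np.asarray(Y)
        rng = np.random.RandomState(self.random_state)
        k = self.labels_per_estimator
        self.n_labels_ = Y.shape[1]
        # each estimator gets a random subset of k labels ...
        self.labelsets_ = [rng.choice(self.n_labels_, k, replace=False)
                           for _ in range(self.n_estimators)]
        weights = 2 ** np.arange(k)
        self.estimators_ = []
        for ls in self.labelsets_:
            # ... and learns the k-bit label combination as a single class
            y_code = Y[:, ls].dot(weights)
            self.estimators_.append(clone(self.base_estimator).fit(X, y_code))
        return self

    def predict(self, X):
        k = self.labels_per_estimator
        votes = np.zeros((X.shape[0], self.n_labels_))
        counts = np.zeros(self.n_labels_)
        for ls, est in zip(self.labelsets_, self.estimators_):
            code = est.predict(X).astype(int)
            bits = (code[:, None] >> np.arange(k)) & 1  # decode back to labels
            votes[:, ls] += bits
            counts[ls] += 1
        # a label is on when more than half of the estimators covering it say so
        return ((counts > 0) & (votes / np.maximum(counts, 1) > 0.5)).astype(int)


# smoke test on random data
rng = np.random.RandomState(1)
X = rng.rand(60, 5)
Y = (rng.rand(60, 4) > 0.5).astype(int)
clf = RandomKLabelsetsSketch(LinearSVC(), n_estimators=8, labels_per_estimator=3)
print(clf.fit(X, Y).predict(X[:3]))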
Example #3
import argparse
import os
import string

from nltk.corpus import stopwords as sw  # assumed: NLTK's stopwords corpus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

parser = argparse.ArgumentParser()
parser.add_argument('input_dir', help='directory containing the train/test splits')
parser.add_argument('out_dir', help='directory to write the results to')
args = parser.parse_args()

stopwords = sw.words('dutch') + list(string.punctuation)

out_dir = args.out_dir
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

#classifier_dir = '{}/classifier/'.format(out_dir)
#if not os.path.exists(classifier_dir):
#    os.makedirs(classifier_dir)

for run in range(1, 11):
    print("Run", run)
    train_file = '{}/train_{}.txt'.format(args.input_dir, run)
    test_file = '{}/test_{}.txt'.format(args.input_dir, run)
    out_file = '{}/output_{}.txt'.format(out_dir, run)

    X_train, X_test, Y_train, Y_test, classes_ = get_data(train_file,
                                                          test_file)
    #print(Y_train.shape)

    # as in Example #2: split is a project-local tokenizer and
    # RandomKLabelsets the project's RAkEL-style ensemble
    clf = make_pipeline(TfidfVectorizer(analyzer=split,
                                        stop_words=stopwords),
                        RandomKLabelsets(LinearSVC(class_weight='balanced'),  # 'auto' in older scikit-learn
                                         n_estimators=Y_train.shape[1]*2,
                                         labels_per_estimator=3))
    clf.fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)

    # print_results is a project-local reporting helper; use a context
    # manager so the output file is closed after each run
    with open(out_file, 'w') as f:
        print_results(Y_test, Y_pred, classes_, f)

# save classifier
#joblib.dump(clf, '{}/classifier.pkl'.format(classifier_dir))
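print_results itself is not shown anywhere in these examples. A plausible stand-in, assuming Y_test and Y_pred are binary indicator matrices and classes_ holds the label names, could report the usual multi-label scores; the function below is hypothetical, not the project's actual helper:

from sklearn.metrics import f1_score, precision_score, recall_score


def print_results_sketch(Y_true, Y_pred, classes_, out):
    # micro/macro averaged precision, recall and F1 over all labels
    for avg in ('micro', 'macro'):
        out.write('{}\tP={:.3f}\tR={:.3f}\tF1={:.3f}\n'.format(
            avg,
            precision_score(Y_true, Y_pred, average=avg, zero_division=0),
            recall_score(Y_true, Y_pred, average=avg, zero_division=0),
            f1_score(Y_true, Y_pred, average=avg, zero_division=0)))
    # one per-label F1 line per class name
    for name, score in zip(classes_, f1_score(Y_true, Y_pred, average=None,
                                              zero_division=0)):
        out.write('{}\t{:.3f}\n'.format(name, score))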