Example #1
def __init__(self, model_path):
    # Load the pickled classifier and vectorizer saved under this base path
    with open(model_path + '.model', 'rb') as fid:
        self.clf = pickle.load(fid)
    with open(model_path + '.dict', 'rb') as fid:
        self.vec = pickle.load(fid)
    self.feat_obj = FeaturesWrapper()
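The constructor expects two pickle files sharing a base path: <model_path>.model holding the classifier and <model_path>.dict holding the vectorizer. A minimal sketch of how such a pair could be written, assuming scikit-learn objects (the class choices and the path are illustrative, not taken from the source):

import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC

model_path = 'example'                    # hypothetical base path
clf, vec = LinearSVC(), DictVectorizer()  # stand-ins for the trained objects

with open(model_path + '.model', 'wb') as fid:  # read back as '.model' above
    pickle.dump(clf, fid)
with open(model_path + '.dict', 'wb') as fid:   # read back as '.dict' above
    pickle.dump(vec, fid)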
Example #2
def predict(X, clf, vec, feat_obj=None, prob=False):
    # Data -> features
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)

    if prob:
        return predict_probs_vectorized(feats, clf, vec)
    else:
        return predict_vectorized(feats, clf, vec)
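A hedged usage sketch: X is a list of raw data points ((sentence-id, text) pairs in the main() below), while clf and vec are the trained classifier and vectorizer, e.g. the pair unpickled in Example #1. The sample data here is illustrative only:

# Illustrative call -- clf and vec must come from a previously trained model.
X = [(1, 'first example sentence'), (2, 'second example sentence')]
hard_labels = predict(X, clf, vec)             # one label per data point
class_probs = predict(X, clf, vec, prob=True)  # per-class probability estimates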
Example #3

def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(BASE_DIR, 'data/annotated.txt'))

    parser.add_argument("-n",
                        dest="length",
                        help="Number of data points to use",
                        default=-1)

    parser.add_argument("-f",
                        dest="folds",
                        help="Number of folds to partition data into",
                        default=10)

    parser.add_argument("-r",
                        dest="random",
                        help="Random shuffling of input data.",
                        action='store_true',
                        default=False)

    # Parse the command line arguments
    args = parser.parse_args()

    # Decode arguments
    txt_files = glob.glob(args.txt)
    length = int(args.length)
    num_folds = int(args.folds)

    # Get data from files
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # List of all data
    X = []
    Y = []
    for n in notes:
        # Data points
        x = list(zip(n.sid_list(), n.text_list()))
        X += x

        # Labels
        y = list(n.label_list())
        Y += y

    # Limit length (-1, the default, means "use all data points")
    if length > 0:
        X = X[:length]
        Y = Y[:length]

    # Build confusion matrix
    confusion = [[0 for i in labels_map] for j in labels_map]

    # Instantiate feat obj once (it'd really slow down CV to rebuild every time)
    feat_obj = FeaturesWrapper()

    # Extract features once
    feats = train.extract_features(X, feat_obj)
    data = list(zip(feats, Y))

    # For each held-out test set
    i = 1
    for training, testing in cv_partitions(data,
                                           num_folds=num_folds,
                                           shuffle=args.random):

        # Users like to see progress
        print('Fold: %d of %d' % (i, num_folds))
        i += 1

        # Train on non-heldout data
        X_train = [d[0] for d in training]
        Y_train = [d[1] for d in training]
        vec, clf = train.train_vectorized(X_train,
                                          Y_train,
                                          model_path=None,
                                          grid=False)

        # Predict on held out
        X_test = [d[0] for d in testing]
        Y_test = [d[1] for d in testing]
        labels = predict.predict_vectorized(X_test, clf, vec)

        # Compute confusion matrix for held_out data
        testing_confusion = evaluate.create_confusion(labels, Y_test)
        confusion = add_matrix(confusion, testing_confusion)

    # Evaluate
    evaluate.display_confusion(confusion)
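Assuming main() lives in a script named cross_validate.py (the file name is a guess), a run that exercises the flags defined above might look like:

python cross_validate.py -t 'data/*.txt' -n 1000 -f 5 -r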
Example #4
def extract_features(X, feat_obj=None):
    # Data -> features
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)
    return feats