예제 #1
0
def predict(X, clf, vec, feat_obj=None):
    """Classify raw data points X.

    Args:
        X: raw data points to featurize and classify.
        clf: trained classifier.
        vec: fitted vectorizer matching clf.
        feat_obj: optional FeaturesWrapper; a fresh one is built when omitted.

    Returns:
        Whatever predict_vectorized() produces for the extracted features.
    """
    # Data -> features.
    # FIX: compare to None with `is`, not `==` (PEP 8; `==` invokes __eq__).
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)

    return predict_vectorized(feats, clf, vec)
예제 #2
0
 def __init__(self, model_path):
     """Restore the pickled classifier (.model) and vectorizer (.dict) from model_path."""
     # NOTE(review): pickle.load is unsafe on untrusted files -- only load trusted models.
     model_file = model_path + '.model'
     vocab_file = model_path + '.dict'
     with open(model_file, 'rb') as handle:
         self.clf = pickle.load(handle)
     with open(vocab_file, 'rb') as handle:
         self.vec = pickle.load(handle)
     self.feat_obj = FeaturesWrapper()
예제 #3
0
def predict(X, clf, vec, feat_obj=None, prob=False):
    """Classify raw data points X, optionally returning scores instead of labels.

    Args:
        X: raw data points to featurize and classify.
        clf: trained classifier.
        vec: fitted vectorizer matching clf.
        feat_obj: optional FeaturesWrapper; a fresh one is built when omitted.
        prob: when True return probability/decision scores, else labels.

    Returns:
        predict_probs_vectorized() output if prob is True, else
        predict_vectorized() output.
    """
    # Data -> features.
    # FIX: compare to None with `is`, not `==` (PEP 8; `==` invokes __eq__).
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)

    if prob:
        return predict_probs_vectorized(feats, clf, vec)
    else:
        return predict_vectorized(feats, clf, vec)
예제 #4
0
def predict(X, clf, vec, feat_obj=None, prob=False):
    """Classify raw data points X, optionally returning scores instead of labels.

    Args:
        X: raw data points to featurize and classify.
        clf: trained classifier.
        vec: fitted vectorizer matching clf.
        feat_obj: optional FeaturesWrapper; a fresh one is built when omitted.
        prob: when True return probability/decision scores, else labels.

    Returns:
        predict_probs_vectorized() output if prob is True, else
        predict_vectorized() output.
    """
    # Data -> features.
    # FIX: identity comparison with None (`is None`), not equality (PEP 8).
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)

    if prob:
        return predict_probs_vectorized(feats, clf, vec)
    else:
        return predict_vectorized(feats, clf, vec)
예제 #5
0
class TwitterHawk(object):
    """Sentiment predictor backed by a pickled classifier + vectorizer pair."""

    def __init__(self, model_path):
        """Load <model_path>.model (classifier) and <model_path>.dict (vectorizer)."""
        # NOTE(review): pickle.load is unsafe on untrusted input -- only load
        # model files from trusted sources.
        with open(model_path + '.model', 'rb') as fid:
            self.clf = pickle.load(fid)
        with open(model_path + '.dict', 'rb') as fid:
            self.vec = pickle.load(fid)
        # Built once here; rebuilding per predict() call would be wasteful.
        self.feat_obj = FeaturesWrapper()

    def predict(self, X, predict_type='label'):
        """Classify raw data X.

        Args:
            X: raw data points.
            predict_type: 'label' for mapped class labels, 'probs' for
                decision-function scores.

        Raises:
            ValueError: on any other predict_type (previously the method
                returned None silently, hiding caller typos).
        """
        # Data -> features
        feats = self.feat_obj.extract_features(X)
        if predict_type == 'label':
            return self.predict_vectorized(feats)
        elif predict_type == 'probs':
            return self.predict_probs_vectorized(feats)
        raise ValueError("predict_type must be 'label' or 'probs', got %r"
                         % (predict_type,))

    def predict_probs_vectorized(self, feats):
        """Return raw decision-function scores for pre-extracted features."""
        vectorized = self.vec.transform(feats)
        return self.clf.decision_function(vectorized)

    def predict_vectorized(self, feats):
        """Return human-readable labels for pre-extracted features."""
        vectorized = self.vec.transform(feats)
        labels = self.clf.predict(vectorized)
        # Map numeric classifier output back to the original label names.
        labels = [reverse_labels_map[y] for y in labels]
        return labels
예제 #6
0
 def __init__(self, model_path):
     """Restore a pickled classifier/vectorizer pair saved under model_path."""
     # The two artifacts share a path prefix and differ only by suffix.
     for suffix, attr in (('.model', 'clf'), ('.dict', 'vec')):
         with open(model_path + suffix, 'rb') as fin:
             setattr(self, attr, pickle.load(fin))
     self.feat_obj = FeaturesWrapper()
예제 #7
0
class TwitterHawk(object):
    """Classifier wrapper: loads a pickled model and predicts labels or scores."""

    def __init__(self, model_path):
        """Unpickle the classifier (.model) and vectorizer (.dict) from model_path."""
        with open(model_path + '.model', 'rb') as model_file:
            self.clf = pickle.load(model_file)
        with open(model_path + '.dict', 'rb') as dict_file:
            self.vec = pickle.load(dict_file)
        # One shared feature extractor for all predict() calls.
        self.feat_obj = FeaturesWrapper()

    def predict(self, X, predict_type='label'):
        """Featurize X and dispatch on predict_type ('label' or 'probs')."""
        feats = self.feat_obj.extract_features(X)
        if predict_type == 'probs':
            return self.predict_probs_vectorized(feats)
        if predict_type == 'label':
            return self.predict_vectorized(feats)
        # Any other predict_type falls through and returns None (original behavior).

    def predict_probs_vectorized(self, feats):
        """Decision-function scores for already-extracted features."""
        return self.clf.decision_function(self.vec.transform(feats))

    def predict_vectorized(self, feats):
        """Predicted labels for already-extracted features, mapped to label names."""
        raw = self.clf.predict(self.vec.transform(feats))
        return [reverse_labels_map[label] for label in raw]
def main():
    """Cross-validate the classifier over annotated twitter data.

    Command line flags:
        -t  glob of training files (default data/annotated.txt)
        -n  number of data points to use (-1 = use all)
        -f  number of CV folds (default 10)
        -r  shuffle the data before partitioning
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(BASE_DIR, 'data/annotated.txt'))

    parser.add_argument("-n",
                        dest="length",
                        help="Number of data points to use (-1 means all)",
                        default=-1)

    parser.add_argument("-f",
                        dest="folds",
                        help="Number of folds to partition data into",
                        default=10)

    parser.add_argument("-r",
                        dest="random",
                        help="Random shuffling of input data.",
                        action='store_true',
                        default=False)

    # Parse the command line arguments
    args = parser.parse_args()

    # Decode arguments.
    # BUG FIX: the default length of -1 previously sliced off the LAST data
    # point (X[:-1]) instead of keeping everything; a None bound keeps all.
    txt_files = glob.glob(args.txt)
    length = int(args.length)
    if length < 0:
        length = None
    num_folds = int(args.folds)

    # Get data from files
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # Flatten notes into parallel data/label lists.
    X = []
    Y = []
    for n in notes:
        # Data points: (sentence id, text) pairs.
        X += list(zip(n.sid_list(), n.text_list()))
        # Labels
        Y += list(n.label_list())

    # Limit length (None slice bound keeps everything)
    X = X[:length]
    Y = Y[:length]

    # Build confusion matrix
    confusion = [[0 for i in labels_map] for j in labels_map]

    # Instantiate feat obj once (it'd really slow down CV to rebuild every time)
    feat_obj = FeaturesWrapper()

    # Extract features once; list() keeps this working on Python 3 where zip
    # is lazy and not subscriptable.
    feats = train.extract_features(X, feat_obj)
    data = list(zip(feats, Y))

    # For each held-out test set (X and Y are already length-limited above)
    for i, (training, testing) in enumerate(cv_partitions(data,
                                                          num_folds=num_folds,
                                                          shuffle=args.random),
                                            1):

        # Users like to see progress
        print('Fold: %d of %d' % (i, num_folds))

        # Train on non-heldout data
        X_train = [d[0] for d in training]
        Y_train = [d[1] for d in training]
        vec, clf = train.train_vectorized(X_train,
                                          Y_train,
                                          model_path=None,
                                          grid=False)

        # Predict on held out
        X_test = [d[0] for d in testing]
        Y_test = [d[1] for d in testing]
        labels = predict.predict_vectorized(X_test, clf, vec)

        # Accumulate the confusion matrix over the held-out folds
        testing_confusion = evaluate.create_confusion(labels, Y_test)
        confusion = add_matrix(confusion, testing_confusion)

    # Evaluate
    evaluate.display_confusion(confusion)
예제 #9
0
def extract_features(X, feat_obj=None):
    """Convert raw data points X into feature dictionaries.

    Args:
        X: raw data points.
        feat_obj: optional FeaturesWrapper; a fresh one is built when omitted.

    Returns:
        The features produced by feat_obj.extract_features(X).
    """
    # Data -> features.
    # FIX: compare to None with `is`, not `==` (PEP 8; `==` invokes __eq__).
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)
    return feats
예제 #10
0
def extract_features(X, feat_obj=None):
    """Convert raw data points X into feature dictionaries.

    Args:
        X: raw data points.
        feat_obj: optional FeaturesWrapper; a fresh one is built when omitted.

    Returns:
        The features produced by feat_obj.extract_features(X).
    """
    # Data -> features.
    # FIX: identity comparison with None (`is None`), not equality (PEP 8).
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)
    return feats