def predict(X, clf, vec, feat_obj=None):
    """Classify raw data points with a trained model.

    Converts raw data in X to feature dicts and delegates the actual
    classification to predict_vectorized.

    Args:
        X: list of raw data points to classify.
        clf: trained classifier used by predict_vectorized.
        vec: fitted vectorizer used by predict_vectorized.
        feat_obj: optional FeaturesWrapper; a fresh one is created when None
            (building it is relatively expensive, so callers may share one).

    Returns:
        Whatever predict_vectorized returns (predicted labels).
    """
    # 'is None' is the correct identity test; '== None' can be fooled by
    # objects overriding __eq__.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)
    return predict_vectorized(feats, clf, vec)
def __init__(self, model_path):
    """Load a pickled classifier and vectorizer from disk.

    Expects two files sharing a path prefix: '<model_path>.model'
    (the classifier) and '<model_path>.dict' (the vectorizer).
    """
    with open(model_path + '.model', 'rb') as model_file:
        self.clf = pickle.load(model_file)
    with open(model_path + '.dict', 'rb') as dict_file:
        self.vec = pickle.load(dict_file)
    # One feature extractor shared for the lifetime of this object.
    self.feat_obj = FeaturesWrapper()
def predict(X, clf, vec, feat_obj=None, prob=False):
    """Classify raw data points, optionally returning scores instead of labels.

    Args:
        X: list of raw data points to classify.
        clf: trained classifier.
        vec: fitted vectorizer.
        feat_obj: optional FeaturesWrapper; a fresh one is created when None.
        prob: when True, return decision scores via predict_probs_vectorized;
            otherwise return hard labels via predict_vectorized.

    Returns:
        Scores or labels, depending on `prob`.
    """
    # 'is None' is the correct identity test; '== None' can be fooled by
    # objects overriding __eq__.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)
    if prob:
        return predict_probs_vectorized(feats, clf, vec)
    return predict_vectorized(feats, clf, vec)
class TwitterHawk(object):
    """Loads a pickled classifier + vectorizer pair and predicts labels."""

    def __init__(self, model_path):
        """Load the model from '<model_path>.model' and '<model_path>.dict'.

        NOTE(review): pickle.load on untrusted files can execute arbitrary
        code -- only load model files from trusted sources.
        """
        with open(model_path + '.model', 'rb') as fid:
            self.clf = pickle.load(fid)
        with open(model_path + '.dict', 'rb') as fid:
            self.vec = pickle.load(fid)
        # Built once here; rebuilding per predict() call would be slow.
        self.feat_obj = FeaturesWrapper()

    def predict(self, X, predict_type='label'):
        """Extract features from X and classify.

        Args:
            X: raw data points for the feature extractor.
            predict_type: 'label' for mapped labels, 'probs' for
                decision-function scores.

        Raises:
            ValueError: for an unrecognized predict_type (previously this
                silently returned None, hiding caller typos).
        """
        feats = self.feat_obj.extract_features(X)
        if predict_type == 'label':
            return self.predict_vectorized(feats)
        elif predict_type == 'probs':
            return self.predict_probs_vectorized(feats)
        raise ValueError(
            "predict_type must be 'label' or 'probs', got %r" % (predict_type,))

    def predict_probs_vectorized(self, feats):
        """Return raw decision-function scores for feature dicts."""
        vectorized = self.vec.transform(feats)
        return self.clf.decision_function(vectorized)

    def predict_vectorized(self, feats):
        """Return human-readable labels for feature dicts."""
        vectorized = self.vec.transform(feats)
        labels = self.clf.predict(vectorized)
        # Map numeric classifier output back to label names.
        return [reverse_labels_map[y] for y in labels]
def __init__(self, model_path):
    """Restore a trained model: classifier from '<prefix>.model',
    vectorizer from '<prefix>.dict'."""
    with open(model_path + '.model', 'rb') as clf_file:
        self.clf = pickle.load(clf_file)
    with open(model_path + '.dict', 'rb') as vec_file:
        self.vec = pickle.load(vec_file)
    # Reused across predictions; constructing it repeatedly is wasteful.
    self.feat_obj = FeaturesWrapper()
class TwitterHawk(object):
    """Wraps a pickled classifier and vectorizer; predicts labels or scores."""

    def __init__(self, model_path):
        """Load the two pickles sharing `model_path` as a prefix."""
        with open(model_path + '.model', 'rb') as model_file:
            self.clf = pickle.load(model_file)
        with open(model_path + '.dict', 'rb') as vocab_file:
            self.vec = pickle.load(vocab_file)
        # Single feature extractor reused across predict() calls.
        self.feat_obj = FeaturesWrapper()

    def predict(self, X, predict_type='label'):
        """Extract features from X, then classify.

        predict_type selects 'label' (mapped labels) or 'probs'
        (decision-function scores); any other value yields None.
        """
        feats = self.feat_obj.extract_features(X)
        if predict_type == 'label':
            return self.predict_vectorized(feats)
        if predict_type == 'probs':
            return self.predict_probs_vectorized(feats)

    def predict_probs_vectorized(self, feats):
        """Decision-function scores for already-extracted features."""
        return self.clf.decision_function(self.vec.transform(feats))

    def predict_vectorized(self, feats):
        """Predicted labels, mapped through reverse_labels_map."""
        raw = self.clf.predict(self.vec.transform(feats))
        return [reverse_labels_map[label] for label in raw]
def main():
    """Run k-fold cross-validation over annotated tweet files and print a
    combined confusion matrix.

    Command-line flags:
        -t  glob of annotated training files
        -n  number of data points to use (negative means all; previously the
            default of -1 silently DROPPED the last data point via X[:-1])
        -f  number of CV folds
        -r  shuffle data before partitioning
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(BASE_DIR, 'data/annotated.txt'))
    parser.add_argument("-n",
                        dest="length",
                        help="Number of data points to use",
                        default=-1)
    parser.add_argument("-f",
                        dest="folds",
                        help="Number of folds to partition data into",
                        default=10)
    parser.add_argument("-r",
                        dest="random",
                        help="Random shuffling of input data.",
                        action='store_true',
                        default=False)

    # Parse the command line arguments
    args = parser.parse_args()

    # Decode arguments
    txt_files = glob.glob(args.txt)
    num_folds = int(args.folds)
    # BUGFIX: a negative length means "use everything".  The old code sliced
    # with -1 directly, so the default dropped the final data point.
    length = int(args.length)
    if length < 0:
        length = None

    # Get data from files
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # List of all data points and their labels
    X = []
    Y = []
    for n in notes:
        X += list(zip(n.sid_list(), n.text_list()))
        Y += list(n.label_list())

    # Limit length (None slice keeps everything)
    X = X[:length]
    Y = Y[:length]

    # Build confusion matrix
    confusion = [[0 for i in labels_map] for j in labels_map]

    # Instantiate feat obj once (it'd really slow down CV to rebuild every time)
    feat_obj = FeaturesWrapper()

    # Extract features once; materialize the pairing because it is sliced
    # below (zip() would be a one-shot iterator on Python 3).
    feats = train.extract_features(X, feat_obj)
    data = list(zip(feats, Y))

    # For each held-out test set
    for i, (training, testing) in enumerate(
            cv_partitions(data[:length], num_folds=num_folds, shuffle=args.random),
            start=1):
        # Users like to see progress
        print('Fold: %d of %d' % (i, num_folds))

        # Train on non-heldout data
        X_train = [d[0] for d in training]
        Y_train = [d[1] for d in training]
        vec, clf = train.train_vectorized(X_train, Y_train,
                                          model_path=None, grid=False)

        # Predict on held out
        X_test = [d[0] for d in testing]
        Y_test = [d[1] for d in testing]
        labels = predict.predict_vectorized(X_test, clf, vec)

        # Compute confusion matrix for held_out data
        testing_confusion = evaluate.create_confusion(labels, Y_test)
        confusion = add_matrix(confusion, testing_confusion)

    # Evaluate
    evaluate.display_confusion(confusion)
def extract_features(X, feat_obj=None):
    """Convert raw data points into feature dicts.

    Args:
        X: list of raw data points.
        feat_obj: optional FeaturesWrapper; a fresh one is created when None.

    Returns:
        The feature representation produced by feat_obj.extract_features(X).
    """
    # 'is None' is the correct identity test; '== None' can be fooled by
    # objects overriding __eq__.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    return feat_obj.extract_features(X)