示例#1
0
class StatementClassifier(object):
  """Clause classifier built on ``OneVsRestClassifier(SVC(kernel='linear'))``.

  In training mode the instance only records its configuration; ``train``
  later fits the model and pickles the classifier, the feature index and the
  tagset to the configured paths. In prediction mode those three pickles are
  loaded at construction time.
  """

  def __init__(self, train=False, cv=True, folds=5, trained_model_name="trained_model.pkl", feat_index_name="feature_index.pkl", stored_tagset="tagset.pkl"):
    """Configure the classifier and, unless ``train`` is set, load a stored model.

    Args:
      train: when true, skip loading and prepare for a later ``train`` call.
      cv: when true, ``train`` reports cross-validation scores before fitting.
      folds: number of cross-validation folds.
      trained_model_name: path of the pickled classifier.
      feat_index_name: path of the pickled feature index.
      stored_tagset: path of the pickled list of labels.
    """
    self.trained_model_name = trained_model_name
    self.feat_index_name = feat_index_name
    self.stored_tagset = stored_tagset
    self.cv = cv
    self.folds = folds
    self.fp = FeatureProcessing()
    if train:
      sys.stderr.write("Statement classifier initialized for training.\n")
      if self.cv:
        sys.stderr.write("Cross-validation will be done\n")
    else:
      # NOTE(security): pickle executes arbitrary code while loading; these
      # paths must only ever point at trusted, locally produced files.
      with open(self.trained_model_name, "rb") as model_file:
        self.classifier = pickle.load(model_file)
      with open(self.feat_index_name, "rb") as index_file:
        feat_index = pickle.load(index_file)
      with open(self.stored_tagset, "rb") as tagset_file:
        self.tagset = pickle.load(tagset_file)
      self.fp.feat_index = feat_index
      sys.stderr.write("Stored model loaded. Statement classifier initialized for prediction.\n")

  def predict(self, testfile_name):
    """Label every line of a UTF-8 file holding one clause per line.

    Returns:
      A list of ``(predicted_label, clause)`` pairs in file order; an empty
      list when the file has no lines.
    """
    with codecs.open(testfile_name, "r", "utf-8") as test_file:
      test_data = [line.strip() for line in test_file]
    filter_feature = get_filter()
    if not test_data:
      return []
    X = numpy.asarray([self.fp.featurize(clause, filter_feature) for clause in test_data])
    predictions = [self.tagset[ind] for ind in self.classifier.predict(X)]
    # list(...) keeps the return value a real list on Python 3 as well.
    return list(zip(predictions, test_data))

  def train(self, trainfile_name):
    """Fit the classifier on a UTF-8 file of ``label<TAB>clause`` lines.

    The rows are shuffled, labels are lower-cased, features are indexed and,
    when ``self.cv`` is set, weighted-F1 cross-validation scores are written
    to stderr. The fitted classifier, the feature index and the tagset are
    then pickled to the paths given at construction time.
    """
    sys.stderr.write("Reading data..\n")
    with codecs.open(trainfile_name, "r", "utf-8") as train_file:
      train_data = [tuple(line.strip().split("\t")) for line in train_file]
    shuffle(train_data)
    filter_feature = get_filter()
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    sys.stderr.write("Indexing features..\n")
    self.fp.index_data(train_clauses, filter_feature)
    X = numpy.asarray([self.fp.featurize(clause, filter_feature) for clause in train_clauses])
    tagset = list(set(train_labels))
    tag_index = {l: i for (i, l) in enumerate(tagset)}
    # Y is kept as a column of single-element rows, as the fit call below
    # has always received it; the flat version is only used for scoring.
    Y = numpy.asarray([[tag_index[label]] for label in train_labels])

    classifier = OneVsRestClassifier(SVC(kernel='linear'))
    if self.cv:
      sys.stderr.write("Starting Cross-validation for %d folds..\n" % (self.folds))
      y = [l[0] for l in Y]
      scores = cross_validation.cross_val_score(classifier, X, y, cv=self.folds, scoring='f1_weighted')
      sys.stderr.write("Scores: %s\n" % (scores,))
      sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (scores.mean(), scores.std() * 2))

    sys.stderr.write("Starting training..\n")
    classifier.fit(X, Y)
    with open(self.trained_model_name, "wb") as model_file:
      pickle.dump(classifier, model_file)
    with open(self.feat_index_name, "wb") as index_file:
      pickle.dump(self.fp.feat_index, index_file)
    with open(self.stored_tagset, "wb") as tagset_file:
      pickle.dump(tagset, tagset_file)

    sys.stderr.write("Done\n")
示例#2
0
 def __init__(self, train=False, cv=True, folds=5, trained_model_name="trained_model.pkl", feat_index_name="feature_index.pkl", stored_tagset="tagset.pkl"):
   """Store the configuration and, unless ``train`` is set, load a stored model.

   Args:
     train: when true, nothing is loaded; only status messages are written.
     cv: cross-validation flag, stored as ``self.cv``.
     folds: fold count, stored as ``self.folds``.
     trained_model_name: path of the pickled classifier.
     feat_index_name: path of the pickled feature index.
     stored_tagset: path of the pickled tagset.
   """
   self.trained_model_name = trained_model_name
   self.feat_index_name = feat_index_name
   self.stored_tagset = stored_tagset
   self.cv = cv
   self.folds = folds
   self.fp = FeatureProcessing()
   if train:
     sys.stderr.write("Statement classifier initialized for training.\n")
     if self.cv:
       sys.stderr.write("Cross-validation will be done\n")
   else:
     # NOTE(security): pickle runs arbitrary code on load; only point these
     # paths at trusted files.
     with open(self.trained_model_name, "rb") as model_file:
       self.classifier = pickle.load(model_file)
     with open(self.feat_index_name, "rb") as index_file:
       feat_index = pickle.load(index_file)
     with open(self.stored_tagset, "rb") as tagset_file:
       self.tagset = pickle.load(tagset_file)
     self.fp.feat_index = feat_index
     sys.stderr.write("Stored model loaded. Statement classifier initialized for prediction.\n")
示例#3
0
 def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
   """Create the trainer or tagger object for the selected algorithm.

   Args:
     do_train: when true, build a trainer; otherwise build or load a tagger.
     trained_model_name: path of the stored model.
     algorithm: ``"crf"`` selects ``Trainer``/``Tagger``; any other value
       selects ``FrankWolfeSSVM`` over a ``ChainCRF`` model.
   """
   self.trained_model_name = trained_model_name
   self.fp = FeatureProcessing()
   self.do_train = do_train
   self.algorithm = algorithm
   if algorithm == "crf":
     if do_train:
       self.trainer = Trainer()
     else:
       self.tagger = Tagger()
   elif do_train:
     self.trainer = FrankWolfeSSVM(model=ChainCRF())
     self.feat_index = {}
     self.label_index = {}
   else:
     # NOTE(security): pickle runs arbitrary code on load; only use trusted
     # model and index files.
     with open(self.trained_model_name, "rb") as model_file:
       self.tagger = pickle.load(model_file)
     with open("ssvm_feat_index.pkl", "rb") as feat_file:
       self.feat_index = pickle.load(feat_file)
     with open("ssvm_label_index.pkl", "rb") as label_file:
       label_index = pickle.load(label_file)
     # Predictions come back as indices, so keep the index -> label view.
     self.rev_label_index = {i: x for x, i in label_index.items()}
示例#4
0
class PassageTagger(object):
  """Sequence tagger over clauses grouped into passages.

  With ``algorithm="crf"`` it drives a ``Trainer``/``Tagger`` pair; with any
  other value it uses ``FrankWolfeSSVM`` over a ``ChainCRF`` model and keeps
  its own feature and label indices, stored in ``ssvm_feat_index.pkl`` and
  ``ssvm_label_index.pkl``.
  """

  def __init__(self, do_train=False, trained_model_name="passage_crf_model", algorithm="crf"):
    """Create the trainer or tagger object for the selected algorithm.

    Args:
      do_train: when true, build a trainer; otherwise build or load a tagger.
      trained_model_name: path of the stored model.
      algorithm: ``"crf"`` or anything else for the structured SVM.
    """
    self.trained_model_name = trained_model_name
    self.fp = FeatureProcessing()
    self.do_train = do_train
    self.algorithm = algorithm
    if algorithm == "crf":
      if do_train:
        self.trainer = Trainer()
      else:
        self.tagger = Tagger()
    elif do_train:
      self.trainer = FrankWolfeSSVM(model=ChainCRF())
      self.feat_index = {}
      self.label_index = {}
    else:
      # NOTE(security): pickle runs arbitrary code on load; only use trusted
      # model and index files.
      with open(self.trained_model_name, "rb") as model_file:
        self.tagger = pickle.load(model_file)
      with open("ssvm_feat_index.pkl", "rb") as feat_file:
        self.feat_index = pickle.load(feat_file)
      with open("ssvm_label_index.pkl", "rb") as label_file:
        label_index = pickle.load(label_file)
      # Predictions come back as indices, so keep the index -> label view.
      self.rev_label_index = {i: x for x, i in label_index.items()}

  def read_input(self, filename):
    """Read passages from a UTF-8 file where blank lines separate sequences.

    When ``self.do_train`` is set every non-blank line must be
    ``clause<TAB>label``; otherwise the whole stripped line is the clause.

    Returns:
      ``(str_seqs, feat_seqs, label_seqs)``: parallel lists with one entry per
      passage. Each feature item is a dict mapping a feature to its count.
      Label sequences are empty lists when not training.
    """
    str_seqs, feat_seqs, label_seqs = [], [], []
    str_seq, feat_seq, label_seq = [], [], []
    with codecs.open(filename, "r", "utf-8") as infile:
      for line in infile:
        stripped = line.strip()
        if not stripped:
          # A blank line closes the current passage, if there is one.
          if str_seq:
            str_seqs.append(str_seq)
            feat_seqs.append(feat_seq)
            label_seqs.append(label_seq)
            str_seq, feat_seq, label_seq = [], [], []
          continue
        if self.do_train:
          clause, label = stripped.split("\t")
          label_seq.append(label)
        else:
          clause = stripped
        str_seq.append(clause)
        feat_dict = {}
        for f in self.fp.get_features(clause):
          feat_dict[f] = feat_dict.get(f, 0) + 1
        feat_seq.append(feat_dict)
    # The file may end without a trailing blank line.
    if str_seq:
      str_seqs.append(str_seq)
      feat_seqs.append(feat_seq)
      label_seqs.append(label_seq)
    return str_seqs, feat_seqs, label_seqs

  def _vectorize(self, feat_seqs):
    """Turn sequences of feature-count dicts into dense arrays via ``self.feat_index``.

    Features missing from the index are ignored.
    """
    width = len(self.feat_index)
    Xs = []
    for fs in feat_seqs:
      X = []
      for feat_dict in fs:
        x = [0] * width
        for f, count in feat_dict.items():
          if f in self.feat_index:
            x[self.feat_index[f]] = count
        X.append(x)
      Xs.append(numpy.asarray(X))
    return Xs

  def predict(self, feat_seqs):
    """Tag every feature sequence and return one list of labels per sequence."""
    sys.stderr.write("Tagging %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      self.tagger.open(self.trained_model_name)
      return [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    pred_ind_seqs = self.tagger.predict(self._vectorize(feat_seqs))
    return [[self.rev_label_index[pred_ind] for pred_ind in ps] for ps in pred_ind_seqs]

  def train(self, feat_seqs, label_seqs):
    """Train on parallel feature and label sequences and store the model.

    The CRF branch hands the model path to ``self.trainer.train``. The other
    branch extends the feature and label indices, fits the trainer and
    pickles the trainer plus both indices.
    """
    sys.stderr.write("Training on %d sequences\n" % len(feat_seqs))
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
      return

    # Index every feature first so all vectors share the final width.
    for fs in feat_seqs:
      for feat_dict in fs:
        for f in feat_dict:
          if f not in self.feat_index:
            self.feat_index[f] = len(self.feat_index)
    Xs = self._vectorize(feat_seqs)

    for ls in label_seqs:
      for label in ls:
        if label not in self.label_index:
          self.label_index[label] = len(self.label_index)
    Ys = [numpy.asarray([self.label_index[label] for label in ls]) for ls in label_seqs]

    self.trainer.fit(Xs, Ys)
    with open(self.trained_model_name, "wb") as model_file:
      pickle.dump(self.trainer, model_file)
    with open("ssvm_feat_index.pkl", "wb") as feat_file:
      pickle.dump(self.feat_index, feat_file)
    with open("ssvm_label_index.pkl", "wb") as label_file:
      pickle.dump(self.label_index, label_file)
示例#5
0
import sys
import codecs
import pickle
from features import FeatureProcessing

# Featurize a training file and pickle the result.
# Usage: <script> TRAINFILE OUTFILE
# TRAINFILE is UTF-8 with one "label<TAB>clause" pair per line.
trainfile_name = sys.argv[1]
outfile_name = sys.argv[2]
fp = FeatureProcessing()
# Close the input as soon as it has been read instead of leaking the handle.
with codecs.open(trainfile_name, "r", "utf-8") as train_file:
    train_data = [tuple(line.strip().split("\t")) for line in train_file]
train_labels, train_clauses = zip(*train_data)
sys.stderr.write("Indexing features..\n")
fp.index_data(train_clauses)
feats = [fp.featurize(clause) for clause in train_clauses]

# The with-block guarantees the pickle is flushed and closed before exit.
with open(outfile_name, "wb") as out_file:
    pickle.dump((feats, train_labels, fp.feat_index), out_file)
示例#6
0
import sys
import codecs
import pickle
from features import FeatureProcessing

# Featurize a training file and pickle the result.
# Usage: <script> TRAINFILE OUTFILE
# TRAINFILE is UTF-8 with one "label<TAB>clause" pair per line.
trainfile_name = sys.argv[1]
outfile_name = sys.argv[2]
fp = FeatureProcessing()
# Read inside a with-block so the input handle is closed deterministically.
with codecs.open(trainfile_name, "r", "utf-8") as train_file:
    train_data = [tuple(row.strip().split("\t")) for row in train_file]
train_labels, train_clauses = zip(*train_data)
sys.stderr.write("Indexing features..\n")
fp.index_data(train_clauses)
feats = [fp.featurize(clause) for clause in train_clauses]

# Output is the tuple (feature vectors, labels, feature index).
with open(outfile_name, "wb") as out_file:
    pickle.dump((feats, train_labels, fp.feat_index), out_file)