Example #1
  def read_problem(self, sent_file=None, gold_file=None, feat_dir=None):
    # read sents
    if not sent_file:
      sent_file = 'dat/%s/sents/%s.aligned' % (self.task, self.id)
      gold_file = 'dat/%s/gold/%s.aligned' % (self.task, self.id)
    with open(sent_file, 'r') as fin:
      self.sent = fin.readline().strip().split()
      self.N = len(self.sent)
      fin.readline()
      self.deps = [int(x) for x in fin.readline().strip().split()]
    with open(gold_file, 'r') as fin:
      self.gold_sent = fin.readline().strip().split()
      fin.readline()
      self.gold_deps = [int(x) for x in fin.readline().strip().split()]

    # compression rate
    self.compression = len(self.gold_sent) / float(self.N)

    # read feats
    # ori and ref feats
    for dir_ in ['ori', 'ref']:
      if not feat_dir:
        ffiles = ['dat/%s/feats/%s/%s/%s.bin' % (self.task, dir_, ftype, self.id) for ftype in ['uni', 'bi', 'dep']]
      else:
        ffiles = ['%s/%s/%s/%s.bin' % (feat_dir, dir_, ftype, self.id) for ftype in ['uni', 'bi', 'dep']]

      # Use different Feature object for sent and gold sent
      # because load_feat needs to know the input dimension
      if dir_ == 'ori':
        ori_feature = Feature(self.sent, None, None, None)
        self.feats_ind = ori_feature.ind
        feature = ori_feature
      else:
        # TODO: this constructor computes more info than necessary just to load features
        ref_feature = StructFeature(self.gold_sent, None, self.gold_deps, None, ori_feature)
        feature = ref_feature

      # change array feats to mydefaultdict feats
      feat_dicts = [[] for _ in range(FType.SIZE)]
      for ftype, ffile in zip(range(FType.SIZE), ffiles):
        # feats is an array of vectors
        feats = feature.load_feat(ftype, ffile)
        for feat in feats:
          d = mydefaultdict(mydouble)
          for f in feat:
            d[f] = 1
          feat_dicts[ftype].append(d)

      if dir_ == 'ori':
        # feature factory
        self.feats = feat_dicts
      else:
        # sum of all features in the gold sentence
        self.gold_feats = mydefaultdict(mydouble)
        # uni, bi, dep
        for feats in feat_dicts:
          for feat in feats:
            self.gold_feats.iadd(feat)
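A minimal sketch of the three-line file layout the reads above imply (tokens, one skipped line, space-separated integer head indices); what the skipped middle line holds is an assumption here:

# hedged sketch: build a toy file matching the reads in read_problem above
with open('toy.aligned', 'w') as fout:
    fout.write('the dog barked\n')  # tokens -> self.sent
    fout.write('DT NN VBD\n')       # middle line is skipped; tags are only a guess
    fout.write('2 3 0\n')           # dependency heads -> self.deps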
Example #2
  def get_curr_feature(self):
    feats = mydefaultdict(mydouble)

    # unigram
    feats_factory = self.feats[FType.UNI]
    for pos, idx in self.feats_ind[FType.UNI].items():
      i, = pos
      var = self.model.getVarByName('u_%d' % (i+1))
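      # var.x is the variable's value in the solved ILP; the tolerance below treats
      # it as binary 1 despite solver round-off (same for the bigram and edge checks)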
      if abs(var.x - 1.0) < 1e-5:
        feats.iadd(feats_factory[idx])

    # bigram
    feats_factory = self.feats[FType.BI]
    for pos, idx in self.feats_ind[FType.BI].items():
      i, j = pos
      var = self.model.getVarByName('b_%d_%d' % (i+1, j+1))
      if abs(var.x - 1.0) < 1e-5:
        feats.iadd(feats_factory[idx])

    # edge
    feats_factory = self.feats[FType.DEP]
    for pos, idx in self.feats_ind[FType.DEP].items():
      h, m = pos
      var = self.model.getVarByName('e_%d_%d' % (h+1, m+1))
      if abs(var.x - 1.0) < 1e-5:
        feats.iadd(feats_factory[idx])

    return feats
Example #3
 def __init__(self, feat_vector, costs=None):
     self.featureVector = mydefaultdict(mydouble)
     for key, val in feat_vector.items():
         self.featureVector[key] = val
     self.costs = costs
     if self.costs is not None:
         self._normalize_costs()
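A hedged usage sketch: the feature vector is any mapping of feature name to value, and costs maps each label to its misclassification cost (the names below are made up):

inst = Instance({'w=dog': 1.0, 'bias': 1.0}, costs={'pos': 0.0, 'neg': 1.0})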
Example #4
 def read_weights(self, filepath):
   w = mydefaultdict(mydouble)
   with open(filepath, 'rb') as fin:
     for line in fin:
       ss = line.strip().split('\t')
       w[ss[0]] = float(ss[1])
   return w
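A complementary writer, not part of the snippet, assuming the same tab-separated name/value format that read_weights expects:

def write_weights(w, filepath):
    # one "name<TAB>value" pair per line, mirroring read_weights above
    with open(filepath, 'wb') as fout:
        for name, value in w.items():
            fout.write('%s\t%s\n' % (name, value))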
Example #5
  def get_curr_feature(self):
    feats = mydefaultdict(mydouble)
    # bigrams
    for idx, feat in self.bigram_feats.items():
      var = self.model.getVarByName('b_%d' % idx)
      if abs(var.x - 1.0) < 1e-5:
        feats.iadd(feat)
    #self.selected_bigrams = bigrams
    #self.selected_bigrams_feat = bigrams_feat

    # average
    # TODO: check average
    #self._average_feat(bigrams_feat, len(bigrams))

    # edges
    for (sent_id, hid, mid), feat in self.edge_feats.items():
      var = self.model.getVarByName('e_%d_%d_%d' % (sent_id, hid, mid))
      if abs(var.x - 1.0) < 1e-5:
        feats.iadd(feat)
    #self.cut_edges = edges
    #self.cut_edges_feat = edges_feat

    # average
    #self._average_feat(edges_feat, len(edges))

    return feats
Example #6
 def get_curr_feature(self):
   # bigrams
   feats = mydefaultdict(mydouble)
   for idx, feat in self.bigram_feats.items():
     var = self.model.getVarByName('b_%d' % idx)
     if abs(var.x - 1.0) < 1e-5:
       feats.iadd(feat)
   return feats
Example #7
 def _get_feature(self, string):
   """
   get feature vector from ':' separated feature string
   """
   feat = mydefaultdict(mydouble)
   for name, value in [x.split(':') for x in string.split(' ')]:
     feat[name] = float(value)
   return feat
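For instance, with made-up feature names ('problem' is a hypothetical instance), one space-separated line becomes a sparse vector:

feat = problem._get_feature('w=dog:1.0 doc_ratio:0.5')
print feat['w=dog'], feat['doc_ratio']  # -> 1.0 0.5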
Example #8
    def removeHapaxLegomena(instances):
        print "Counting features"
        feature2counts = mydefaultdict(mydouble)
        for instance in instances:
            for element in instance.featureVector:
                feature2counts[element] += 1

        print "Removing hapax legomena"
        newInstances = []
        for instance in instances:
            newFeatureVector = mydefaultdict(mydouble)
            for element in instance.featureVector:
                # if this feature was encountered more than once
                if feature2counts[element] > 1:
                    newFeatureVector[element] = instance.featureVector[element]
            newInstances.append(Instance(newFeatureVector, instance.costs))
        return newInstances
Example #9
    def resorted(self):
        new = WVector()
        for action, feats in self.iteritems():
            new[action] = mydefaultdict(WVector.value_class,
                                        sorted(feats.items()))

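        # NB: 'del self' only unbinds the local name; the old WVector is not
        # actually freed until its callers drop their references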
        del self
        return new
Example #10
    def __init__(self, value_class=None):
        if value_class is None:
            value_class = WVector.value_class

        # can add new actions on the fly; doesn't need to specify list of actions a priori
        # TODO: lambda : mydefaultdict(value_class)
        defaultdict.__init__(
            self, mydefaultdict,
            [(action, mydefaultdict(value_class))
             for action in WVector.action_names])  # doublehash 1
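A hedged usage sketch of the resulting "double hash": the outer defaultdict maps an action name to a sparse weight vector. The action and feature names below are made up, and per the TODO above, actions missing from WVector.action_names would get a factory-less inner dict:

w = WVector()
w['SHIFT']['w0=dog'] += 1.0  # assumes 'SHIFT' is listed in WVector.action_names
print w['SHIFT']['w0=dog']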
Example #11
def test_instance_from_list(featureList):
    featureVector = mydefaultdict(mydouble)

    for featureID,featureVal in enumerate(featureList):
        # This makes word features sparse
        if featureVal!=0:
            featureVector[featureID] = featureVal

    # print "Test feature vector is",featureVector
    return Instance(featureVector)
Example #12
 def _initialize_vectors(self, instances, averaging, rounds, adapt):
     """
     Initialize the weight vectors in the beginning of training.
     We have one variance and one weight vector per class.
     """
     self.currentWeightVectors = {}
     if adapt:
         self.currentVarianceVectors = {}
     # default to None so the return below is defined even when averaging is off
     averagedWeightVectors = None
     updatesLeft = None
     if averaging:
         averagedWeightVectors = {}
         updatesLeft = rounds * len(instances)
     for label in instances[0].costs:
         self.currentWeightVectors[label] = mydefaultdict(mydouble)
         # remember: this is sparse in the sense that everything that doesn't have a value is 1
         # every time we do something with it, remember to add 1
         if adapt:
             self.currentVarianceVectors[label] = {}
         # keep the averaged weight vector
         if averaging:
             averagedWeightVectors[label] = mydefaultdict(mydouble)
     return averagedWeightVectors, updatesLeft
Example #13
def train_instance_from_list(costDict, featureList):
    """
    Generate an Instance from a training example with costs. The input is a list of feature values (e.g. vectorized words); the vectorizer is the only mapping from each word to a unique ID.
    """
    featureVector = mydefaultdict(mydouble)

    for featureID,featureVal in enumerate(featureList):
        # This makes word features sparse
        if featureVal!=0:
            featureVector[featureID] = featureVal

    return Instance(featureVector, costDict)
Example #14
 def removeHapaxLegomena(instances):
     """
     Hapax Legomena are features that appear only once in the whole
     dataset. This static method removes these features from the
     dataset.
     """
     print "Counting features"
     feature2counts = mydefaultdict(mydouble)
     for instance in instances:
         for element in instance.featureVector:
             feature2counts[element] += 1
     print len(feature2counts)
     print "Removing hapax legomena"
     newInstances = []
     for instance in instances:
         newFeatureVector = mydefaultdict(mydouble)
         for element in instance.featureVector:
             # if this feature was encountered more than once
             if feature2counts[element] > 1:
                 newFeatureVector[element] = instance.featureVector[element]
         newInstances.append(Instance(newFeatureVector, instance.costs))
     return newInstances
Example #15
    def train(self, instances, averaging=True, shuffling=True, rounds=10, param=1, adapt=True):
        """
        Train the classifier. If adapt is False then we have PA-II with
        prediction-based updates. If adapt is True then we have AROW.
        The param value is only used in AROW, not in PA-II.
        """
        # This is a bit nasty, averagedWeightVectors will be None if
        # averaging is False. Setting it as an instance attribute
        # might be better.
        averagedWeightVectors, updatesLeft = self._initialize_vectors(instances, averaging, rounds, adapt)

        for r in xrange(rounds):
            if shuffling:
                random.shuffle(instances)
            errorsInRound = 0
            costInRound = 0
            for instance in instances:
                prediction = self.predict(instance)
                # so if the prediction was incorrect
                # we are no longer large margin, since we are using the loss from the cost-sensitive PA
                # print "Instance costs are",instance.costs
                # print "Prediction label is",prediction.label
                if instance.costs[prediction.label] > 0:
                    errorsInRound += 1
                    costInRound += instance.costs[prediction.label]
                    self._update_parameters(instance, prediction, averaging, adapt, param,
                                            averagedWeightVectors, updatesLeft)
                if averaging:
                    updatesLeft -= 1
            print "Training error rate in round " + str(r) + " : " + str(float(errorsInRound) / len(instances))

        if averaging:
            for label in self.currentWeightVectors:
                self.currentWeightVectors[label] = mydefaultdict(mydouble)
                self.currentWeightVectors[label].iaddc(averagedWeightVectors[label], 1.0/float(rounds*len(instances)))

        # Compute the final training error:
        finalTrainingErrors = 0
        finalTrainingCost = 0
        for instance in instances:
            prediction = self.predict(instance)
            if instance.costs[prediction.label] > 0:
                finalTrainingErrors +=1
                finalTrainingCost += instance.costs[prediction.label]

        finalTrainingErrorRate = float(finalTrainingErrors)/len(instances)
        print "Final training error rate=" + str(finalTrainingErrorRate)
        print "Final training cost=" + str(finalTrainingCost)

        return finalTrainingCost
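A toy scalar check, outside the library, of the averaging trick used above: adding each update scaled by updatesLeft and dividing by the total number of steps equals the mean of the per-step weight snapshots:

T = 4                              # rounds * len(instances) in the real code
deltas = [1.0, 0.0, 2.0, -1.0]     # made-up per-step updates
w, acc, snapshots = 0.0, 0.0, []
updatesLeft = T
for delta in deltas:
    w += delta
    acc += delta * updatesLeft     # mirrors iaddc(..., alpha * updatesLeft)
    updatesLeft -= 1
    snapshots.append(w)
print acc / T, sum(snapshots) / T  # both print 1.75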
Example #16
    def load(self, filename):
        model_weights = open(filename, 'r')
        weightVectors = pickle.load(model_weights)
        model_weights.close()
        for label, weightVector in weightVectors.items():
            self.currentWeightVectors[label] = mydefaultdict(mydouble, weightVector)

        try:
            with gzip.open(filename + "_probVectors.gz", "rb") as probFile:
                print "loading probabilities"
                pickleDictProbVectors = pickle.load(probFile)
                self.probWeightVectors = []
                for sample in pickleDictProbVectors:
                    label2Vectors = {}
                    for label,vector in sample.items():
                        label2Vectors[label] = mydefaultdict(mydouble, vector)
                    self.probWeightVectors.append(label2Vectors)

                probFile.close()
                self.probabilities = True
        except IOError:
            print 'No weight vectors for probability estimates'
            self.probabilities = False
Example #17
  def read_problem(self):
    sent_file = 'dat/%s/sents/%s.aligned' % (self.task, self.id)
    bigram_feature_file = 'dat/%s/features/%s.bigram.feat' % (self.task, self.id)
    bigram_pos_file = 'dat/%s/features/%s.bigram.pos' % (self.task, self.id)
    gold_bigram_feature_file = 'dat/%s/solutions/maxrouge/%s.bigram' % (self.task, self.id)

    self.sents = []

    # read sentences and deps
    with open(sent_file, 'r') as fin:
      while True:
        line = fin.readline()
        if line == '':
          break
        self.sents.append(line.strip().split('\t'))
        fin.readline()  # toks
        fin.readline()  # stems
        fin.readline()  # tags
        fin.readline()  # deps
        fin.readline()  # labels
        fin.readline()  # empty line

    # read bigram features
    # build bigram dict
    # bigram_feats[bigram_id] = hvector feat
    with open(bigram_feature_file, 'r') as fin:
      self.bigram_feats = {}
      self.bigrams = {}
      for i, line in enumerate(fin):
        ss = line.strip().split('\t')
        self.bigrams[ss[0]] = i
        self.bigram_feats[i] = self._get_feature(ss[1])
      self.bigram_ids = {idx: bigram for bigram, idx in self.bigrams.items()}

    # read bigram position
    with open(bigram_pos_file, 'r') as fin:
      self.bigram_pos = {}
      for line in fin:
        ss = line.strip().split('\t')
        self.bigram_pos[self.bigrams[ss[0]]] = [tuple([int(x) for x in pos.split('_')]) for pos in ss[1].split(' ')]

    if self.train:
      # read gold bigrams and features
      with open(gold_bigram_feature_file, 'r') as fin:
        self.gold_bigrams = []
        self.gold_feats = mydefaultdict(mydouble)
        for s in fin:
          bigram_id = self.bigrams[s.strip()]
          self.gold_bigrams.append(bigram_id)
          self.gold_feats.iadd(self.bigram_feats[bigram_id])
Example #18
def instance_from_svm_input(svm_input):
    """
    Generate an Instance from a SVMLight input.
    """
    feat_vec = mydefaultdict(mydouble)
    costs = {}
    splitted = svm_input.split()
    if splitted[0] == "-1":
        costs["neg"] = 0
        costs["pos"] = 1
    elif splitted[0] == "+1":
        costs["neg"] = 1
        costs["pos"] = 0
    for elem in splitted[1:]:
        fid, val = elem.split(':')
        feat_vec[fid] = float(val)
    return Instance(feat_vec, costs)
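A hedged usage sketch with a made-up SVMLight line:

inst = instance_from_svm_input('+1 1:0.5 3:1.2')
print inst.costs['pos'], inst.costs['neg']              # -> 0 1
print inst.featureVector['1'], inst.featureVector['3']  # -> 0.5 1.2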
Example #19
def train_instance_from_svm_input(line):
    """
    Generate an Instance from a set of training instances with costs
    """
    details = line.split("|")
    costs = {}
    featureVector = mydefaultdict(mydouble)
    costDict = details[0].split()
    featureDict = details[1].split()
    for pair in costDict:
        label, cost = pair.split(":")
        # Account for infinity
        # if cost is not "inf" else float(1e10)
        costs[label] = float(cost)
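    # NB: id(intern(...)) below maps each distinct feature string to a numeric
    # value that is stable within one process, but not across runs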
    for featureID,featureVal in enumerate(featureDict):
        featureVector[featureID] = float(id(intern(featureVal)))

    return Instance(featureVector, costs)
Example #20
    def probGeneration(self, scale=1.0, noWeightVectors=100):
        # initialize the weight vectors
        print "Generating samples for the weight vectors to obtain probability estimates"
        self.probWeightVectors = []
        for i in xrange(noWeightVectors):
            self.probWeightVectors.append({})
            for label in self.currentWeightVectors:
                self.probWeightVectors[i][label] = mydefaultdict(mydouble)

        for label in self.currentWeightVectors:
            # We are ignoring features that never got their weight set 
            for feature in self.currentWeightVectors[label]:
                # note that if the weight was updated, then the variance must have been updated too, i.e. we shouldn't have 0s
                weights = numpy.random.normal(self.currentWeightVectors[label][feature], scale * self.currentVarianceVectors[label][feature], noWeightVectors)
                # we got the samples, now let's put them in the right places
                for i,weight in enumerate(weights):
                    self.probWeightVectors[i][label][feature] = weight
                
        print "done"
        self.probabilities = True
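One plausible way, not shown in this snippet, to turn the sampled weight vectors into label probabilities is to vote over the samples; a hedged sketch:

def prob_estimate(self, instance):
    # fraction of sampled weight vectors under which each label wins
    votes = dict((label, 0) for label in self.currentWeightVectors)
    for weightVector in self.probWeightVectors:
        best = max(weightVector,
                   key=lambda label: instance.featureVector.dot(weightVector[label]))
        votes[best] += 1
    return dict((label, float(v) / len(self.probWeightVectors))
                for label, v in votes.items())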
Example #21
  def get_delta_feature(self):
    """
    Get the difference between the features of the
    current structure and the gold structure
    """
    curr = self.get_curr_feature()

    delta = mydefaultdict(mydouble)
    delta.iaddc(self.gold_feats, 1)
    delta.iaddc(curr, -1)

    if debug_level > 0:
      print 'predicted feat:'
      print curr.items()[:10]

      print 'gold feat:'
      print self.gold_feats.items()[:10]

      print 'delta feat:'
      print delta.items()[:10]

    return delta
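A hedged sketch of the perceptron step this delta presumably feeds, using the weights and learning_rate attributes initialized in Example #28 ('problem' and 'learner' are stand-in names):

delta = problem.get_delta_feature()
learner.weights.iaddc(delta, learner.learning_rate)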
Example #22
if __name__ == "__main__":

    import sys
    import random
    random.seed(13)
    numpy.random.seed(13)
    dataLines = open(sys.argv[1]).readlines()

    instances = []
    classifier_p = AROW()
    print "Reading the data"
    for line in dataLines:
        details = line.split()
        costs = {}
        featureVector = mydefaultdict(mydouble)

        if details[0] == "-1":
            costs["neg"] = 0
            costs["pos"] = 1
        elif details[0] == "+1":
            costs["neg"] = 1
            costs["pos"] = 0

        for feature in details[1:]:
            featureID, featureVal = feature.split(":")
            featureVector[featureID] = float(featureVal)
        instances.append(Instance(featureVector, costs))
        #print instances[-1].costs

    random.shuffle(instances)
Example #23
from _mycollections import mydefaultdict
from mydouble import mydouble

x = mydouble(1)  # the original snippet starts mid-script; an initial value is assumed
print x

x += 5
print x

x += mydouble(2)
print x
print x.__copy__()

x = mydouble()
print x

# x += "1"
# print x

d = mydefaultdict(mydouble)  # always like that

d["a"] = 1  # no need to say mydouble(1); transparent to the user
print d

d.iadd(d)
print "d=", d

d.iaddc(d, 0.5)
print "d=", d

c = mydefaultdict(mydouble)  # always like that
print c

c += d
#c.__iadd__(d)
Example #24
  problem = SummaryProblemJointDep('tac09', 'D0901A-A')
  print problem.sents[0]
  #print problem.labels[0]
  print problem.deps[0]
  #print problem.bigrams['of control']
  #print problem.bigram_feats[0]
  #print problem.edge_feats[0,2,0]
  #print problem.bigram_pos[0]
  #print problem.gold_edges
  #print problem.gold_bigrams
  #print [problem.bigram_ids[i] for i in problem.gold_bigrams]

  #edge_weights = mydefaultdict(mydouble)
  #bigram_weights = mydefaultdict(mydouble)
  #bigram_weights['doc_ratio'] = 1.0
  weights = mydefaultdict(mydouble)
  weights['doc_ratio'] = 1.0

  start = time.clock()
  problem.build_ilp()
  print 'building ilp time:', time.clock() - start
  problem.model.write('D0901A-A.gurobi.lp')

  #start = time.clock()
  #problem.solve()
  #print 'solving ilp time:', time.clock() - start

  #bigrams, bigrams_feat, edges, edges_feat = problem.get_curr_feature()
  #print bigrams
  #print sorted(edges, key=lambda x: x[0])
  #print [problem.bigram_ids[x] for x in bigrams]
Example #25
    def train(self, instances, averaging=True, shuffling=True, rounds = 10, param = 1, adapt=True):
        # we first need to go through the dataset to find how many classes there are

        # Initialize the weight vectors at the beginning of training:
        # we have one variance and one weight vector per class
        self.currentWeightVectors = {} 
        if adapt:
            self.currentVarianceVectors = {}
        if averaging:
            averagedWeightVectors = {}
            updatesLeft = rounds*len(instances)
        for label in instances[0].costs:
            self.currentWeightVectors[label] = mydefaultdict(mydouble)
            # remember: this is sparse in the sense that everything that doesn't have a value is 1
            # every time we do something with it, remember to add 1
            if adapt:
                self.currentVarianceVectors[label] = {}
            # keep the averaged weight vector
            if averaging:
                averagedWeightVectors[label] = mydefaultdict(mydouble)

        # in each iteration        
        for r in range(rounds):
            # shuffle
            if shuffling:
                random.shuffle(instances)
            errorsInRound = 0
            costInRound = 0
            # for each instance
            for instance in instances:
                prediction = self.predict(instance)

                # so if the prediction was incorrect
                # we are no longer large margin, since we are using the loss from the cost-sensitive PA
                if instance.costs[prediction.label] > 0:
                    errorsInRound += 1
                    costInRound += instance.costs[prediction.label]

                    # first we need to get the score for the correct answer
                    # if the instance has more than one correct answer then pick the min
                    minCorrectLabelScore = float("inf")
                    minCorrectLabel = None
                    for label in instance.correctLabels:
                        score = instance.featureVector.dot(self.currentWeightVectors[label])
                        if score < minCorrectLabelScore:
                            minCorrectLabelScore = score
                            minCorrectLabel = label
                            
                    # the loss is the scaled margin loss also used by Mejer and Crammer 2010
                    loss = prediction.score - minCorrectLabelScore  + math.sqrt(instance.costs[prediction.label])
                        
                    if adapt:
                        # Calculate the confidence values
                        # first for the predicted label
                        zVectorPredicted = mydefaultdict(mydouble)
                        zVectorMinCorrect = mydefaultdict(mydouble)
                        for feature in instance.featureVector:
                            # the variance is either some value that is in the dict or just 1
                            if feature in self.currentVarianceVectors[prediction.label]:
                                zVectorPredicted[feature] = instance.featureVector[feature] * self.currentVarianceVectors[prediction.label][feature]
                            else:
                                zVectorPredicted[feature] = instance.featureVector[feature]
                            # then for the minCorrect:
                            if feature in self.currentVarianceVectors[minCorrectLabel]:
                                zVectorMinCorrect[feature] = instance.featureVector[feature] * self.currentVarianceVectors[minCorrectLabel][feature]
                            else:
                                zVectorMinCorrect[feature] = instance.featureVector[feature]
                    
                        confidence = zVectorPredicted.dot(instance.featureVector) + zVectorMinCorrect.dot(instance.featureVector)

                        beta = 1.0/(confidence + param)

                        alpha = loss * beta

                        # update the current weight vectors
                        self.currentWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha)
                        self.currentWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha)

                        if averaging:
                            averagedWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha * updatesLeft)
                            averagedWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha * updatesLeft)
                        
                    else:
                        # the squared norm is twice the square of the features since they are the same per class 
                        norm = 2*(instance.featureVector.dot(instance.featureVector))
                        factor = loss/(norm + float(1)/(2*param))
                        self.currentWeightVectors[prediction.label].iaddc(instance.featureVector, -factor)
                        self.currentWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor)

                        if averaging:
                            averagedWeightVectors[prediction.label].iaddc(instance.featureVector, -factor * updatesLeft)
                            averagedWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor * updatesLeft)
                        
                    
                    if adapt:
                        # update the diagonal covariance
                        for feature in instance.featureVector.iterkeys():
                            # for the predicted
                            if feature in self.currentVarianceVectors[prediction.label]:
                                self.currentVarianceVectors[prediction.label][feature] -= beta * pow(zVectorPredicted[feature], 2)
                            else:
                                # Never updated this covariance before, add 1
                                self.currentVarianceVectors[prediction.label][feature] = 1 - beta * pow(zVectorPredicted[feature], 2)
                            # for the minCorrect
                            if feature in self.currentVarianceVectors[minCorrectLabel]:
                                self.currentVarianceVectors[minCorrectLabel][feature] -= beta * pow(zVectorMinCorrect[feature], 2)
                            else:
                                # Never updated this covariance before, add 1
                                self.currentVarianceVectors[minCorrectLabel][feature] = 1 - beta * pow(zVectorMinCorrect[feature], 2)

                if averaging:
                    updatesLeft -= 1
                
            print "Training error rate in round " + str(r) + " : " + str(float(errorsInRound)/len(instances))
	    
        if averaging:
            for label in self.currentWeightVectors:
                self.currentWeightVectors[label] = mydefaultdict(mydouble)
                self.currentWeightVectors[label].iaddc(averagedWeightVectors[label], 1.0/float(rounds*len(instances)))

        # Compute the final training error:
        finalTrainingErrors = 0
        finalTrainingCost = 0
        for instance in instances:
            prediction = self.predict(instance)
            if instance.costs[prediction.label] > 0:
                finalTrainingErrors +=1
                finalTrainingCost += instance.costs[prediction.label]

        finalTrainingErrorRate = float(finalTrainingErrors)/len(instances)
        print "Final training error rate=" + str(finalTrainingErrorRate)
        print "Final training cost=" + str(finalTrainingCost)

        return finalTrainingCost
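For reference, the adaptive branch above implements the AROW update (Crammer et al.), with the scaled-margin loss of Mejer and Crammer 2010; in compact diagonal form:

# z_y        = Sigma_y * x                      (per coordinate)
# confidence = x' Sigma_yhat x + x' Sigma_y x
# beta       = 1 / (confidence + param)
# alpha      = loss * beta
# w_yhat    -= alpha * z_yhat ;  w_y += alpha * z_y
# Sigma     -= beta * z**2                      (diagonal covariance update)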
Example #26
#!/usr/bin/env python

from _mycollections import mydefaultdict
from mydouble import mydouble, counts

d = mydefaultdict(mydouble)  # always like that

d["a"] = 1                   # no need to say mydouble(1); transparent to the user
print d

print d.addc(d, 0.5)

for i in xrange(500000):
    d[str(i)] = 2

print len(d)

import gc
e = d.copy()
print "before", e["a"], counts()

for i in xrange(20):
#    e = e.deepcopy()
    e.iaddc(d, 0.5)
#    e.addc(d, 0.5)
    print e["a"], counts()
#    gc.collect()
#     del e
# ##    gc.collect()
#     e = f
    
Example #27
def test_instance_from_svm_input(line):
    featureVector = mydefaultdict(mydouble)
    featureDict = line.split()
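    # NB: same id(intern(...)) trick as in Example #19 -- a per-process numeric
    # stand-in for the feature string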
    for featureID,featureVal in enumerate(featureDict):
        featureVector[featureID] = float(id(intern(featureVal)))
    return Instance(featureVector)
Example #28
  def __init__(self, train, dev, test, learning_rate=1.0, iteration=10, shuffle=False, method='extract', scratch=scratch_dir, max_data_size=10000, init_model=None):
    # model: extract/joint/jointdep
    self.method = method

    # load training data
    if train:
      # file ids
      self.train_problems = self.read_problems(train, max_data_size, True)
      log.write('Load %d training problems from %s\n' % (len(self.train_problems), train))

    # load dev data
    if dev:
      self.dev_problems = self.read_problems(dev, max_data_size, False)
      log.write('Load %d dev problems from %s\n' % (len(self.dev_problems), dev))

    # load test data
    if test:
      self.test_problems = self.read_problems(test, max_data_size, False)
      log.write('Load %d test problems from %s\n' % (len(self.test_problems), test))
    self.test_task = test

    # learning params
    self.learning_rate = learning_rate
    self.shuffle = shuffle
    self.iteration = iteration

    # intermediate result
    self.scratch = scratch

    if not init_model:
      # initial weights: zero vectors
      #self.bigram_weights = mydefaultdict(mydouble)
      #self.bigram_weights['doc_ratio'] = 1
      #self.edge_weights = mydefaultdict(mydouble)
      self.weights = mydefaultdict(mydouble)
    else:
      self.load_model(init_model)

    # initial ilp model
    t = time.clock()
    log.write('Building ILPs ... ')
    ilp_dir = '%s/ilps/%s' % (self.scratch, self.method)
    if not os.path.isdir(ilp_dir):
      os.makedirs(ilp_dir)

    if train:
      self.build_ilps(self.train_problems, self.weights, ilp_dir, maxlen[train[:3]])
    if dev:
      self.build_ilps(self.dev_problems, self.weights, ilp_dir, maxlen[dev[:3]])
    if test:
      self.build_ilps(self.test_problems, self.weights, ilp_dir, maxlen[test[:3]])
    log.write('[%.2fs]\n' % (time.clock() - t))

    # for averaged perceptron
    self.c = 1

    # evaluate on dev
    if dev:
      summary_dir = '%s/summaries/%s' % (self.scratch, dev)
      if not os.path.isdir(summary_dir):
        os.makedirs(summary_dir)

      ref_dir = 'dat/%s/models' % dev
      if self.method == 'jointdep':
        self.evaluator = EvaluatorNgram(self.dev_problems)
      else:
        self.evaluator = EvaluatorRouge(ref_dir, summary_dir)
Example #29
    def _update_parameters(self, instance, prediction, averaging, adapt, param,
                           averagedWeightVectors, updatesLeft):
        """
        Update the weights and return the total number of errors.
        """
        # first we need to get the score for the correct answer
        # if the instance has more than one correct answer then pick the min
        minCorrectLabelScore = float("inf")
        minCorrectLabel = None
        for label in instance.correctLabels:
            score = instance.featureVector.dot(self.currentWeightVectors[label])
            if score < minCorrectLabelScore:
                minCorrectLabelScore = score
                minCorrectLabel = label

        # the loss is the scaled margin loss also used by Mejer and Crammer 2010
        loss = prediction.score - minCorrectLabelScore  + math.sqrt(instance.costs[prediction.label])
        if adapt:
            # Calculate the confidence values
            # first for the predicted label
            zVectorPredicted = mydefaultdict(mydouble)
            zVectorMinCorrect = mydefaultdict(mydouble)
            for feature in instance.featureVector:
                # the variance is either some value that is in the dict or just 1
                if feature in self.currentVarianceVectors[prediction.label]:
                    zVectorPredicted[feature] = instance.featureVector[feature] * self.currentVarianceVectors[prediction.label][feature]
                else:
                    zVectorPredicted[feature] = instance.featureVector[feature]
                # then for the minCorrect:
                if feature in self.currentVarianceVectors[minCorrectLabel]:
                    zVectorMinCorrect[feature] = instance.featureVector[feature] * self.currentVarianceVectors[minCorrectLabel][feature]
                else:
                    zVectorMinCorrect[feature] = instance.featureVector[feature]
            confidence = zVectorPredicted.dot(instance.featureVector) + zVectorMinCorrect.dot(instance.featureVector)
            beta = 1.0 / (confidence + param)
            alpha = loss * beta

            # update the current weight vectors
            self.currentWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha)
            self.currentWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha)
            if averaging:
                averagedWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha * updatesLeft)
                averagedWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha * updatesLeft)
        else:
            # the squared norm is twice the square of the features since they are the same per class 
            norm = 2 * (instance.featureVector.dot(instance.featureVector))
            factor = loss / (norm + 1.0 / (2 * param))
            self.currentWeightVectors[prediction.label].iaddc(instance.featureVector, -factor)
            self.currentWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor)
            if averaging:
                averagedWeightVectors[prediction.label].iaddc(instance.featureVector, -factor * updatesLeft)
                averagedWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor * updatesLeft)
        if adapt:
            # update the diagonal covariance
            for feature in instance.featureVector.iterkeys():
                # for the predicted
                if feature in self.currentVarianceVectors[prediction.label]:
                    self.currentVarianceVectors[prediction.label][feature] -= beta * pow(zVectorPredicted[feature], 2)
                else:
                    # Never updated this covariance before, add 1
                    self.currentVarianceVectors[prediction.label][feature] = 1 - beta * pow(zVectorPredicted[feature], 2)
                # for the minCorrect
                if feature in self.currentVarianceVectors[minCorrectLabel]:
                    self.currentVarianceVectors[minCorrectLabel][feature] -= beta * pow(zVectorMinCorrect[feature], 2)
                else:
                    # Never updated this covariance before, add 1
                    self.currentVarianceVectors[minCorrectLabel][feature] = 1 - beta * pow(zVectorMinCorrect[feature], 2)
Example #30
    def train(self,
              instances,
              averaging=True,
              shuffling=True,
              rounds=10,
              param=1):
        # we first need to go through the dataset to find how many classes there are

        # Initialize the weight vectors at the beginning of training:
        # we have one variance and one weight vector per class
        self.currentWeightVectors = {}
        self.currentVarianceVectors = {}
        if averaging:
            averagedWeightVectors = {}
            updatesLeft = rounds * len(instances)
        for label in instances[0].costs:
            self.currentWeightVectors[label] = mydefaultdict(mydouble)
            # remember: this is sparse in the sense that everything that doesn't have a value is 1
            # every time we do something with it, remember to add 1
            self.currentVarianceVectors[label] = {}
            # keep the averaged weight vector
            if averaging:
                averagedWeightVectors[label] = mydefaultdict(mydouble)

        # in each iteration
        for r in range(rounds):
            # shuffle
            if shuffling:
                random.shuffle(instances)
            errorsInRound = 0
            costInRound = 0
            # for each instance
            for instance in instances:
                prediction = self.predict(instance)

                # so if the prediction was incorrect
                # we are no longer large margin, since we are using the loss from the cost-sensitive PA
                if instance.costs[prediction.label] > 0:
                    errorsInRound += 1
                    costInRound += instance.costs[prediction.label]

                    # first we need to get the score for the correct answer
                    # if the instance has more than one correct answer then pick the min
                    minCorrectLabelScore = float("inf")
                    minCorrectLabel = None
                    for label in instance.correctLabels:
                        score = instance.featureVector.dot(
                            self.currentWeightVectors[label])
                        if score < minCorrectLabelScore:
                            minCorrectLabelScore = score
                            minCorrectLabel = label

                    # Calculate the confidence values
                    # first for the predicted label
                    zVectorPredicted = mydefaultdict(mydouble)
                    zVectorMinCorrect = mydefaultdict(mydouble)
                    for feature in instance.featureVector:
                        # the variance is either some value that is in the dict or just 1
                        if feature in self.currentVarianceVectors[
                                prediction.label]:
                            zVectorPredicted[feature] = instance.featureVector[
                                feature] * self.currentVarianceVectors[
                                    prediction.label][feature]
                        else:
                            zVectorPredicted[feature] = instance.featureVector[
                                feature]
                        # then for the minCorrect:
                        if feature in self.currentVarianceVectors[
                                minCorrectLabel]:
                            zVectorMinCorrect[
                                feature] = instance.featureVector[
                                    feature] * self.currentVarianceVectors[
                                        minCorrectLabel][feature]
                        else:
                            zVectorMinCorrect[
                                feature] = instance.featureVector[feature]

                    confidence = zVectorPredicted.dot(
                        instance.featureVector) + zVectorMinCorrect.dot(
                            instance.featureVector)

                    beta = 1.0 / (confidence + param)

                    # the loss is the scaled margin loss also used by Mejer and Crammer 2010
                    loss = prediction.score - minCorrectLabelScore + math.sqrt(
                        instance.costs[prediction.label])

                    alpha = loss * beta

                    # update the current weight vectors
                    self.currentWeightVectors[prediction.label].iaddc(
                        zVectorPredicted, -alpha)
                    self.currentWeightVectors[minCorrectLabel].iaddc(
                        zVectorMinCorrect, alpha)
                    if averaging:
                        averagedWeightVectors[prediction.label].iaddc(
                            zVectorPredicted, -alpha * updatesLeft)
                        averagedWeightVectors[minCorrectLabel].iaddc(
                            zVectorMinCorrect, alpha * updatesLeft)

                    # update the diagonal covariance
                    for feature in instance.featureVector.iterkeys():
                        # for the predicted
                        if feature in self.currentVarianceVectors[
                                prediction.label]:
                            self.currentVarianceVectors[
                                prediction.label][feature] -= beta * pow(
                                    zVectorPredicted[feature], 2)
                        else:
                            # Never updated this covariance before, add 1
                            self.currentVarianceVectors[
                                prediction.label][feature] = 1 - beta * pow(
                                    zVectorPredicted[feature], 2)
                        # for the minCorrect
                        if feature in self.currentVarianceVectors[
                                minCorrectLabel]:
                            self.currentVarianceVectors[minCorrectLabel][
                                feature] -= beta * pow(
                                    zVectorMinCorrect[feature], 2)
                        else:
                            # Never updated this covariance before, add 1
                            self.currentVarianceVectors[minCorrectLabel][
                                feature] = 1 - beta * pow(
                                    zVectorMinCorrect[feature], 2)

                if averaging:
                    updatesLeft -= 1

            print "Training error rate in round " + str(r) + " : " + str(
                float(errorsInRound) / len(instances))

        if averaging:
            for label in self.currentWeightVectors:
                self.currentWeightVectors[label].iaddc(
                    averagedWeightVectors[label],
                    1.0 / float(rounds * len(instances)))

        # Compute the final training error:
        finalTrainingErrors = 0
        finalTrainingCost = 0
        for instance in instances:
            prediction = self.predict(instance)
            if instance.costs[prediction.label] > 0:
                finalTrainingErrors += 1
                finalTrainingCost += instance.costs[prediction.label]

        finalTrainingErrorRate = float(finalTrainingErrors) / len(instances)
        print "Final training error rate=" + str(finalTrainingErrorRate)
        print "Final training cost=" + str(finalTrainingCost)

        return finalTrainingCost
Example #31
def _unpickle_mydict(s):
    return mydefaultdict(mydouble, (take2(x) for x in s.split()))
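The take2 helper is not shown; one hypothetical reading is that each whitespace-separated token packs a key and a value that take2 splits into a pair, e.g.:

def take2(token):
    # hypothetical helper: split a "key=value" token into a (key, float) pair;
    # the real separator used by the pickler is not shown in this snippet
    key, value = token.split('=')
    return key, float(value)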