Пример #1
0
 def cm_ebmcat(self):
   """Load the mapping stored in ``GS/gs1.txt`` of the alta2012-ebm corpus.

   The file is read as tab-delimited text. For every row the key is the
   first two columns joined with ``-`` and the value is the third column
   split on commas.

   Returns:
     dict mapping ``'<col0>-<col1>'`` strings to lists of strings. When a
     key occurs more than once, the last row wins.
   """
   gold_path = os.path.join(config.getpath('corpora','alta2012-ebm'), 'GS', 'gs1.txt')
   with open(gold_path) as handle:
     # dict() consumes the rows in file order while the file is still open.
     return dict(
       ('{0}-{1}'.format(fields[0], fields[1]), fields[2].split(','))
       for fields in csv.reader(handle, delimiter='\t')
     )
Пример #2
0
  def _parse_data(self):
    """Populate ``self._data`` from the corpus ``test.csv`` on first use.

    Does nothing when ``self._data`` is already set. Otherwise each CSV row
    contributes one entry keyed by ``'<Document>-<Sentence>'`` whose value
    is the row's ``Text`` column; the resulting dict is stored under
    ``self._data['ts']``.
    """
    if self._data is not None:
      return

    csv_path = os.path.join(config.getpath('corpora','alta2012-ebm'), 'test.csv')
    with open(csv_path) as handle:
      text_by_id = dict(
        (record['Document'] + '-' + record['Sentence'], record['Text'])
        for record in csv.DictReader(handle)
      )

    # Assigned only after a successful read, so a failure leaves _data unset.
    self._data = {'ts': text_by_id}
Пример #3
0
  def _parse_data(self):
    """Populate ``self._data`` from the corpus ``train.csv`` on first use.

    Does nothing when ``self._data`` is already set. Rows are keyed by
    ``'<Document>-<Sentence>'``. Two dicts are stored:

    * ``self._data['cm']`` -- for each key, the list of ``Label`` values
      from rows whose ``Prediction`` column equals ``'1'``; keys without
      any such row are absent.
    * ``self._data['ts']`` -- for each key, the ``Text`` column.
    """
    if self._data is not None:
      return

    labels_by_id = {}
    text_by_id = {}
    train_path = os.path.join(config.getpath('corpora','alta2012-ebm'), 'train.csv')
    with open(train_path) as handle:
      for record in csv.DictReader(handle):
        sent_id = record['Document'] + '-' + record['Sentence']
        if record['Prediction'] == '1':
          # Create the list lazily so ids with no positive label stay absent.
          labels_by_id.setdefault(sent_id, []).append(record['Label'])
        text_by_id[sent_id] = record['Text']

    # Assigned only after a successful read, so a failure leaves _data unset.
    self._data = {'cm': labels_by_id, 'ts': text_by_id}
Пример #4
0
 def sp_crossvalidation(self):
   """Build the cross-validation split described by ``data.testset``.

   Each line of the file is expected to contain a tab; the text after the
   first tab is a ``:``-separated list of abstract ids. Line number ``i``
   (0-based) becomes the key ``'fold<i>'``.

   Every abstract id is expanded to the members of its sequence as given
   by ``self.sequence('abstract')``, which is indexed here by the part of
   each sequence's first element that precedes the first ``-``. An id with
   no sequence is recorded as the single entry ``'<id>-1'``.

   Returns:
     dict mapping ``'fold<i>'`` to a list of ids.

   Raises:
     IndexError: if a line contains no tab character.
   """
   sq_index = dict( (s[0].split('-')[0],s) for s in self.sequence('abstract'))
   sp = {}
   with open(os.path.join(config.getpath('corpora','alta2012-ebm'),'data.testset')) as f:
     for i,row in enumerate(f):
       key = 'fold{0}'.format(i)
       value = []
       for abs_id in row.split('\t',1)[1].split(':'):
         # Lines keep their trailing newline when iterating a file, so
         # normalise the token once and use the stripped id for both the
         # lookup and the fallback. Previously only the emptiness test was
         # stripped, which let ids like '123\n' miss the index and leak
         # into the output.
         abs_id = abs_id.strip()
         if not abs_id:
           continue
         try:
           value.extend(sq_index[abs_id])
         except KeyError:
           # No sequence, single-sentence abstract
           value.append('{0}-1'.format(abs_id))
       sp[key] = value
   return sp
Пример #5
0
    def __unpack(self):
        """Load the model into ``self.model`` unless it is already loaded.

        When ``self.path`` is ``None`` the default model is loaded via
        ``load_default_model()``; otherwise ``load_model(self.path)`` is
        used. The loaded pieces are wrapped in a ``PIBOSOModel`` and the
        time taken is logged. Afterwards the classifier path stored in the
        model is compared against the configured ``liblinearclassifier``
        tool path and, if different, rewritten on the L1 classifier and on
        every L0 classifier.
        """
        if self.model is not None:
            logger.debug("already unpacked!")
            return

        with Timer() as timer:
            if self.path is not None:
                logger.info("unpacking model from: {}".format(self.path))
                parts = load_model(self.path)
            else:
                logger.info("unpacking default model")
                parts = load_default_model()
            unpacked = PIBOSOModel(*parts)

        logger.info("unpacking took {0:.2f}s".format(timer.elapsed))

        # hydrat hardcodes the paths for the classifier, which need to be
        # updated if they are installed at a different location
        tool_path = config.getpath("tools", "liblinearclassifier")
        # TODO: Check that the tool exists
        stored_path = unpacked.L1_cl.classifier
        if stored_path != tool_path:
            logger.debug("updating classifier path from {0} to {1}".format(stored_path, tool_path))
            unpacked.L1_cl.classifier = tool_path
            for level0 in unpacked.L0_cl:
                level0.classifier = tool_path

        self.model = unpacked