def __init__(self,
              parts_by_doc=None,
              n_max=3,
              expand=True,
              split_tokens=None):
     """:param parts_by_doc: a dictionary d where d[document_name.upper()] = [partA, partB, ...]"""
     OmniNgrams.__init__(self, n_max=n_max, split_tokens=None)
     self.parts_by_doc = parts_by_doc
     self.expander = expand_part_range if expand else (lambda x: [x])
예제 #2
0
 def __init__(self,
              parts_by_doc=None,
              n_max=1,
              expand=False,
              split_tokens=' '):
     """:param parts_by_doc: a dictionary d where d[document_name.upper()] = [partA, partB, ...]"""
     OmniNgrams.__init__(self, n_max=n_max, split_tokens=' ')
     self.parts_by_doc = parts_by_doc
     self.expander = lambda x: [x]
 def apply(self, session, context):
     for ts in OmniNgrams.apply(self, session, context):
         if ts.get_span().endswith('.0'):
             value = ts.get_span()[:-2]
             yield TemporaryImplicitSpan(
                 sentence=ts.sentence,
                 char_start=ts.char_start,
                 char_end=ts.char_end,
                 expander_key=u'volt_expander',
                 position=0,
                 text=value,
                 words=[value],
                 lemmas=[value],
                 pos_tags=[ts.get_attrib_tokens('pos_tags')[-1]],
                 ner_tags=[ts.get_attrib_tokens('ner_tags')[-1]],
                 dep_parents=[ts.get_attrib_tokens('dep_parents')[-1]],
                 dep_labels=[ts.get_attrib_tokens('dep_labels')[-1]],
                 page=[ts.get_attrib_tokens('page')[-1]]
                 if ts.sentence.is_visual() else [None],
                 top=[ts.get_attrib_tokens('top')[-1]]
                 if ts.sentence.is_visual() else [None],
                 left=[ts.get_attrib_tokens('left')[-1]]
                 if ts.sentence.is_visual() else [None],
                 bottom=[ts.get_attrib_tokens('bottom')[-1]]
                 if ts.sentence.is_visual() else [None],
                 right=[ts.get_attrib_tokens('right')[-1]]
                 if ts.sentence.is_visual() else [None],
                 meta=None)
         else:
             yield ts
예제 #4
0
 def apply(self, session, context):
     for ts in OmniNgrams.apply(self, session, context):
         m = re.match(
             u"^([\+\-\u2010\u2011\u2012\u2013\u2014\u2212\uf02d])?(\s*)(\d+)$",
             ts.get_span(),
             re.U,
         )
         if m:
             if m.group(1) is None:
                 temp = ""
             elif m.group(1) == "+":
                 if m.group(2) != "":
                     continue  # If bigram '+ 150' is seen, accept the unigram '150', not both
                 temp = ""
             else:  # m.group(1) is a type of negative sign
                 # A bigram '- 150' is different from unigram '150', so we keep the implicit '-150'
                 temp = "-"
             temp += m.group(3)
             yield TemporaryImplicitSpan(
                 sentence=ts.sentence,
                 char_start=ts.char_start,
                 char_end=ts.char_end,
                 expander_key=u"temp_expander",
                 position=0,
                 text=temp,
                 words=[temp],
                 lemmas=[temp],
                 pos_tags=[ts.get_attrib_tokens("pos_tags")[-1]],
                 ner_tags=[ts.get_attrib_tokens("ner_tags")[-1]],
                 dep_parents=[ts.get_attrib_tokens("dep_parents")[-1]],
                 dep_labels=[ts.get_attrib_tokens("dep_labels")[-1]],
                 page=[ts.get_attrib_tokens("page")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 top=[ts.get_attrib_tokens("top")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 left=[ts.get_attrib_tokens("left")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 bottom=[ts.get_attrib_tokens("bottom")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 right=[ts.get_attrib_tokens("right")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 meta=None,
             )
         else:
             yield ts
예제 #5
0
 def apply(self, session, context):
     for ts in OmniNgrams.apply(self, session, context):
         enumerated_parts = [
             part.upper() for part in expand_part_range(ts.get_span())
         ]
         parts = set(enumerated_parts)
         if self.parts_by_doc:
             possible_parts = self.parts_by_doc[ts.parent.document.name.upper()]
             for base_part in enumerated_parts:
                 for part in possible_parts:
                     if part.startswith(base_part) and len(base_part) >= 4:
                         parts.add(part)
         for i, part in enumerate(parts):
             if " " in part:
                 continue  # it won't pass the part_matcher
             if part == ts.get_span():
                 yield ts
             else:
                 yield TemporaryImplicitSpan(
                     sentence=ts.sentence,
                     char_start=ts.char_start,
                     char_end=ts.char_end,
                     expander_key=u"part_expander",
                     position=i,
                     text=part,
                     words=[part],
                     lemmas=[part],
                     pos_tags=[ts.get_attrib_tokens("pos_tags")[0]],
                     ner_tags=[ts.get_attrib_tokens("ner_tags")[0]],
                     dep_parents=[ts.get_attrib_tokens("dep_parents")[0]],
                     dep_labels=[ts.get_attrib_tokens("dep_labels")[0]],
                     page=[min(ts.get_attrib_tokens("page"))]
                     if ts.sentence.is_visual()
                     else [None],
                     top=[min(ts.get_attrib_tokens("top"))]
                     if ts.sentence.is_visual()
                     else [None],
                     left=[max(ts.get_attrib_tokens("left"))]
                     if ts.sentence.is_visual()
                     else [None],
                     bottom=[min(ts.get_attrib_tokens("bottom"))]
                     if ts.sentence.is_visual()
                     else [None],
                     right=[max(ts.get_attrib_tokens("right"))]
                     if ts.sentence.is_visual()
                     else [None],
                     meta=None,
                 )
 def apply(self, session, context):
     for ts in OmniNgrams.apply(self, session, context):
         m = re.match(
             u'^([\+\-\u2010\u2011\u2012\u2013\u2014\u2212\uf02d])?(\s*)(\d+)$',
             ts.get_span(), re.U)
         if m:
             if m.group(1) is None:
                 temp = ''
             elif m.group(1) == '+':
                 if m.group(2) != '':
                     continue  # If bigram '+ 150' is seen, accept the unigram '150', not both
                 temp = ''
             else:  # m.group(1) is a type of negative sign
                 # A bigram '- 150' is different from unigram '150', so we keep the implicit '-150'
                 temp = '-'
             temp += m.group(3)
             yield TemporaryImplicitSpan(
                 sentence=ts.sentence,
                 char_start=ts.char_start,
                 char_end=ts.char_end,
                 expander_key=u'temp_expander',
                 position=0,
                 text=temp,
                 words=[temp],
                 lemmas=[temp],
                 pos_tags=[ts.get_attrib_tokens('pos_tags')[-1]],
                 ner_tags=[ts.get_attrib_tokens('ner_tags')[-1]],
                 dep_parents=[ts.get_attrib_tokens('dep_parents')[-1]],
                 dep_labels=[ts.get_attrib_tokens('dep_labels')[-1]],
                 page=[ts.get_attrib_tokens('page')[-1]]
                 if ts.sentence.is_visual() else [None],
                 top=[ts.get_attrib_tokens('top')[-1]]
                 if ts.sentence.is_visual() else [None],
                 left=[ts.get_attrib_tokens('left')[-1]]
                 if ts.sentence.is_visual() else [None],
                 bottom=[ts.get_attrib_tokens('bottom')[-1]]
                 if ts.sentence.is_visual() else [None],
                 right=[ts.get_attrib_tokens('right')[-1]]
                 if ts.sentence.is_visual() else [None],
                 meta=None)
         else:
             yield ts
예제 #7
0
 def apply(self, session, context):
     for ts in OmniNgrams.apply(self, session, context):
         if ts.get_span().endswith(".0"):
             value = ts.get_span()[:-2]
             yield TemporaryImplicitSpan(
                 sentence=ts.sentence,
                 char_start=ts.char_start,
                 char_end=ts.char_end,
                 expander_key=u"volt_expander",
                 position=0,
                 text=value,
                 words=[value],
                 lemmas=[value],
                 pos_tags=[ts.get_attrib_tokens("pos_tags")[-1]],
                 ner_tags=[ts.get_attrib_tokens("ner_tags")[-1]],
                 dep_parents=[ts.get_attrib_tokens("dep_parents")[-1]],
                 dep_labels=[ts.get_attrib_tokens("dep_labels")[-1]],
                 page=[ts.get_attrib_tokens("page")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 top=[ts.get_attrib_tokens("top")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 left=[ts.get_attrib_tokens("left")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 bottom=[ts.get_attrib_tokens("bottom")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 right=[ts.get_attrib_tokens("right")[-1]]
                 if ts.sentence.is_visual()
                 else [None],
                 meta=None,
             )
         else:
             yield ts