示例#1
0
    def try_extract(self, _, future):
        """Attempt to recognise a postal address at the head of `future`.

        The expected layout is: street, house number, optional comma,
        optional zip code, town, and optionally a comma followed by a
        country. Street, house number and town are mandatory; if any of
        them is missing nothing is consumed and None is returned.

        On success the matched tokens are popped from `future` and an
        `AddressEntity` carrying the recognised parts in `attrs` is
        returned. The first positional argument is unused.
        """
        initial = future[0].word[0]
        if not initial.isupper() and not initial.isdigit():
            return None

        ent = AddressEntity('', future[0].position)

        # Mandatory street name, starting at the very first token.
        found, pos = self.try_street(future, 0)
        if not found:
            self.debug('Abort: no street')
            return None
        ent.attrs['ner:addr:street'] = join_tokens(future[:pos])[1]

        # Mandatory house number; the helper stores it on the entity itself.
        found, pos = self.try_house_number(future, ent, pos)
        if not found:
            self.debug('Missing house number')
            return None

        if future[pos].word == ',':
            self.debug('Found optional comma')
            pos += 1

        # The zip code is optional, its absence is only logged.
        if future[pos].typ == Token.ZIP_CODE:
            ent.attrs['ner:addr:zip_code'] = future[pos].word
            pos += 1
        else:
            self.debug('Missing zip code')

        # Mandatory town name.
        town_start = pos
        found, pos = self.try_town(future, pos)
        if not found:
            self.debug('Abort: no town')
            return None
        ent.attrs['ner:addr:town'] = join_tokens(future[town_start:pos])[1]

        comma_after_town = future[pos].word == ','
        if comma_after_town:
            self.debug('Found comma after town')
            pos += 1

        # Optional country.
        country_start = pos
        found, pos = self.try_country(future, pos)
        if found:
            ent.attrs['ner:addr:country'] = join_tokens(
                future[country_start:pos])[1]
        else:
            self.debug('No country')
            if comma_after_town:
                # The comma did not introduce a country, so it is not part
                # of the address: give it back.
                pos -= 1

        tokens = future.popmany(pos)
        ent.whitespace, ent.word = join_tokens(tokens)
        return ent
示例#2
0
    def try_extract(self, _, future):
        """Detect a foreign-language phrase at the head of `future`.

        Every language in `self.words` is tried via `self.try_lang` and the
        one yielding the longest match wins (the first such language on a
        tie). Matches shorter than two tokens, or whose longest word has
        fewer than LIMIT characters, are rejected.

        Returns a `LanguageEntity` built from the popped tokens, or None
        when nothing acceptable was found. The first positional argument
        is unused.
        """
        if not future[0].word[0].isalpha():
            return None

        best_len, best_lang = 0, None
        for lang in self.words.keys():
            self.debug('Trying %s starting from %s', lang, future[0].word)
            candidate_len = self.try_lang(lang, future)
            if candidate_len > best_len:
                best_len, best_lang = candidate_len, lang

        if best_len < 2:
            self.debug('Ignore short case')
            return None

        longest_word = max(len(future[i].word) for i in range(best_len))
        if longest_word < LIMIT:
            self.debug('Too short words')
            return None

        tokens = future.popmany(best_len)
        whitespace, text = join_tokens(tokens)

        ent = LanguageEntity(text, best_lang, tokens[0].position)
        ent.whitespace = whitespace
        return ent
def read_docs(phase='starting_spans'):
    """Load every EBM-NLP document together with its gold label spans.

    For each group in GROUPS the file ``pmids_<group>.txt`` under
    ``config.EBM_NLP_DIR`` is read to learn which PMIDs belong to it (a
    PMID listed in several groups keeps the last one). For every PMID the
    token file is joined into text, and the aggregated annotation files of
    the three entity types are condensed into character-level spans stored
    under ``GOLD_p`` / ``GOLD_i`` / ``GOLD_o`` in ``doc.labels``.

    Args:
        phase: name of the annotation phase sub-directory to read from.

    Returns:
        A list of ``classes.Doc`` objects, each with ``group`` set.
    """
    pmid_groups = {}
    for group_name in GROUPS:
        list_path = os.path.join(config.EBM_NLP_DIR,
                                 'pmids_{}.txt'.format(group_name))
        for pmid in utils.readlines(list_path):
            pmid_groups[pmid] = group_name

    docs = []
    for pmid, group in pmid_groups.items():
        tokens_path = os.path.join(config.EBM_NLP_DIR, 'documents',
                                   '{}.tokens'.format(pmid))
        text, token_offsets = utils.join_tokens(utils.readlines(tokens_path))

        doc = classes.Doc(pmid, text)
        doc.group = group

        # Test annotations live under test/gold, everything else under train.
        if group == 'test':
            subdir = os.path.join('test', 'gold')
        else:
            subdir = 'train'
        ann_name = '{}.AGGREGATED.ann'.format(pmid)

        for entity in ('participants', 'interventions', 'outcomes'):
            label_name = 'GOLD_{}'.format(entity[0])
            ann_path = os.path.join(config.EBM_NLP_DIR, 'annotations',
                                    'aggregated', phase, entity, subdir,
                                    ann_name)
            labels = [int(line) for line in utils.readlines(ann_path)]
            for first_tok, end_tok, _ in utils.condense_labels(labels):
                # Token range [first_tok, end_tok) -> character range.
                start = token_offsets[first_tok][0]
                stop = token_offsets[end_tok - 1][1]
                doc.labels[label_name].append(
                    classes.Span(start, stop, text[start:stop]))
        docs.append(doc)
    return docs
示例#4
0
 def try_house_number(self, future, ent, idx):
     """
     Recognise a house number at position `idx`, optionally followed by
     a slash and an orientation number.

     On success the number (or the joined `number/number` text) is stored
     in `ent.attrs['ner:addr:house']`. Returns a pair `(ok, next_idx)`;
     `next_idx` equals `idx` when nothing was recognised.
     """
     house = future[idx].word
     if not is_number(house):
         return False, idx

     has_orientation = (future[idx + 1].word == '/'
                        and is_number(future[idx + 2].word))
     if not has_orientation:
         self.debug('Found house number <%s>', house)
         ent.attrs['ner:addr:house'] = house
         return True, idx + 1

     # number '/' number: keep all three tokens as a single value.
     _, combined = join_tokens(future[idx:idx + 3])
     ent.attrs['ner:addr:house'] = combined
     return True, idx + 3
示例#5
0
 def try_town(self, future, idx):
     """
     Find the longest town name starting at position `idx`.

     Candidate names of 1 up to TOWN_LIMIT tokens (bounded by the number
     of remaining tokens) are joined and looked up in `self.towns`.
     Returns a pair `(ok, next_idx)`; `next_idx` equals `idx` when no
     town matched.
     """
     self.debug('Finding town')
     window = []
     best = 0
     span = min(TOWN_LIMIT, len(future) - idx)
     for length in range(1, span + 1):
         window.append(future[idx + length - 1])
         _, candidate = join_tokens(window)
         self.debug('Testing <%s>', candidate)
         if candidate in self.towns:
             # Later (longer) matches overwrite earlier ones.
             best = length
             self.debug('Found town <%s>', candidate)
     return (best > 0, idx + best)
示例#6
0
    def finalize(self, is_improper):
        """Compute the derived attributes once all name parts are added.

        Sets the `tag` attribute from `self.possible_tags`, trims trailing
        words for which `is_improper(word)` is truthy, refreshes position,
        whitespace and surface form from the remaining words, and builds
        the `lemma` attribute.

        NOTE(review): the trimming loop assumes at least one word is not
        improper; otherwise the index walks past the start of the list.
        """
        self.attrs['tag'] = '|'.join(self.possible_tags)

        # Drop improper words from the end of the name.
        keep = len(self.words)
        while is_improper(self.words[keep - 1].word):
            keep -= 1
        self.words = self.words[:keep]

        self.set_position(self.words[0].position)
        self.whitespace, self.word = join_tokens(self.words)

        # For every name part pick the most frequent lemma among those
        # whose tag is still possible; parts without such a lemma are
        # skipped.
        chosen = []
        for name_part in self.lemmas:
            candidates = [pair[0] for pair in name_part
                          if pair[1] in self.possible_tags]
            if candidates:
                chosen.append(most_frequent(candidates))
        self.attrs['lemma'] = ' '.join(chosen)
示例#7
0
 def try_street(self, future, idx):
     """
     Try to extract a street (or town used as a street) name starting at
     position `idx` of the token stream. The longest option is taken.

     Candidate names of 1 up to STREET_LIMIT tokens (bounded by the number
     of remaining tokens) are joined and looked up in `self.streets` and
     `self.towns`. A single period token directly following the matched
     name is consumed as part of it.

     Returns a pair `(ok, next_idx)`; `next_idx` equals `idx` when no
     name matched.
     """
     current = []
     longest = 0
     limit = min(STREET_LIMIT, len(future) - idx)
     for count in range(1, limit + 1):
         current.append(future[idx + count - 1])
         _, word = join_tokens(current)
         self.debug('Testing <%s>', word)
         # If there is a period without any spaces around it, add one
         word = re.sub(r'(?<!\s)\.(?!\s)', '. ', word)
         if word in self.streets or word in self.towns:
             longest = count
             self.debug('Found street <%s>', word)

     # Without a match a stray period must not turn failure into success.
     if longest == 0:
         return (False, idx)

     # The token after the match sits at idx + longest, not at longest.
     end = idx + longest
     if end < len(future) and future[end].word == '.':
         end += 1
     return (True, end)
示例#8
0
    def try_extract_entry(self, history, future):
        """
        Look for the longest known phrase at the head of `future`.

        Prefixes of growing length (at most LIMIT tokens, and never the
        last token of `future`) are joined into text and checked with
        `self.lookup_sequence`. If the longest hit passes
        `self.check_length`, its tokens are popped and returned as a
        `PhraseEntity` with its categories and, when available, its lemma.
        Otherwise None is returned.
        """
        if not is_possible_start(future[0].word):
            return None

        window = []
        best_seq = ''
        best_len = 0
        for pos in range(min(LIMIT, len(future) - 1)):
            window.append(future[pos])

            joined = ''.join(to_join(tok) for tok in window)
            # Normalise spacing in front of commas and periods.
            candidate = joined.lstrip().replace(' ,', ',').replace(' .', '.')

            self.debug('current: <%s>', candidate)

            if not self.lookup_sequence(candidate):
                continue
            best_len = pos + 1
            best_seq = candidate
            self.debug('Found match of len %d', best_len)

        self.debug('Longest match was: %d', best_len)
        if not self.check_length(best_len, history, future):
            return None

        tokens = future.popmany(best_len)
        whitespace, text = join_tokens(tokens)

        ent = PhraseEntity(text, tokens[0].position)
        ent.whitespace = whitespace

        lemma = self.find_lemma(best_seq)
        if lemma:
            ent.attrs['lemma'] = lemma

        ent.add_categories(self.get_categories(best_seq))
        return ent
示例#9
0
 def __init__(self, tokens):
     """Build an organisation entity covering `tokens`.

     The surface text and leading whitespace come from joining the
     tokens; the position is taken from the first token.
     """
     leading_ws, surface = join_tokens(tokens)
     ner.NamedEntity.__init__(self, surface, None, 'COrganisation', None)
     self.whitespace = leading_ws
     first_token = tokens[0]
     self.set_position(first_token.position)
     self.set_src("OrganisationNer")
示例#10
0
def build_entity(future, n, typ):
    """Pop `n` tokens off `future` and wrap them in a `PatternEntity`.

    The entity gets the joined text of the tokens, the type `typ`, the
    position of the first popped token and the joined leading whitespace.
    """
    consumed = future.popmany(n)
    leading_ws, surface = join_tokens(consumed)
    return PatternEntity(surface, typ, consumed[0].position, leading_ws)
示例#11
0
 def get_word(self):
     """Return the current name: the text part of the joined words."""
     joined = join_tokens(self.words)
     return joined[1]