def consolidate_NER_results(final_sequences, text):
    """
    Adds character spans to the token/label pairs returned by the NER function.
    :param final_sequences: Sequences returned from the NER function; each sequence is a list of (token, label) pairs.
    :param text: the full article text
    :return: a list of tuples of the form (token, label, span_begin, span_end)
    """
    tokens = []
    for a in final_sequences:
        for b in a:
            tokens.append(b[0])
    spans = align_tokens(tokens, text)
    fin = []
    multiplier = 0
    for i in range(0, len(final_sequences)):
        #multiplier = 0
        if i > 0:
            multiplier = multiplier + len(final_sequences[i-1])
            #subtractor = 1
        for j in range(0, len(final_sequences[i])):
            token = final_sequences[i][j][0]
            label = final_sequences[i][j][1]
            span_min = spans[multiplier+j][0]
            span_max = spans[multiplier+j][1]
            fin.append((token, label, span_min, span_max))
    return fin
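
For reference, align_tokens from nltk.tokenize.util walks the text from left to right and records the character offsets of each token, so the flattened token list has to occur verbatim in text. A minimal usage sketch, assuming the function above is in scope; the NER sequences and labels below are invented for illustration:

from nltk.tokenize.util import align_tokens  # used by consolidate_NER_results

# Hypothetical NER output: two sequences of (token, label) pairs.
text = "Barack Obama visited Paris. He met officials."
final_sequences = [
    [("Barack", "B-PER"), ("Obama", "I-PER"), ("visited", "O"), ("Paris", "B-LOC"), (".", "O")],
    [("He", "O"), ("met", "O"), ("officials", "O"), (".", "O")],
]

print(consolidate_NER_results(final_sequences, text))
# [('Barack', 'B-PER', 0, 6), ('Obama', 'I-PER', 7, 12), ...]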
Example #2
    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> TreebankWordTokenizer().span_tokenize(s) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s)
        if '"' in text:
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
            
            # Replace converted quotes back to double quotes
            tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
        else:
            tokens = raw_tokens

        return align_tokens(tokens, text)
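
The quote restoration above matters because the Treebank tokenizer rewrites double quotes as `` and '', which never occur verbatim in the input, so align_tokens would otherwise fail to find them. A small self-contained illustration using plain NLTK (the sample sentence is made up):

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.util import align_tokens

text = 'She said "hello" to me.'
raw = TreebankWordTokenizer().tokenize(text)
print(raw)  # ['She', 'said', '``', 'hello', "''", 'to', 'me', '.']

# Map the converted quote tokens back to '"' before aligning.
restored = ['"' if tok in ('``', "''") else tok for tok in raw]
print(align_tokens(restored, text))
# [(0, 3), (4, 8), (9, 10), (10, 15), (15, 16), (17, 19), (20, 22), (22, 23)]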
Example #3
File: utils.py Project: AmyOlex/Chrono
def getWhitespaceTokens(file_path):
    with open(file_path, "r") as file:
        raw_text = file.read()
    ## Testing the replacement of all "=" signs by spaces before tokenizing.
    text = raw_text.translate(str.maketrans("=", ' '))

    ## Tokenize the sentences
    sentences = sent_tokenize(text)

    ## Get spans of the sentences
    sent_spans = align_tokens(sentences, text)

    ## create empty arrays for white space tokens and sentence delimiters
    tokenized_text = []
    text_spans = []

    ## Loop through each sentence and get the tokens and token spans
    for s in range(0, len(sentences)):
        # get the tokens and token spans within the sentence
        toks = WhitespaceTokenizer().tokenize(sentences[s])
        span_generator = WhitespaceTokenizer().span_tokenize(sentences[s])
        rel_spans = [span for span in span_generator]

        # convert the relative spans into absolute spans
        abs_spans = []
        for start, end in rel_spans:
            abs_spans = abs_spans + [
                (sent_spans[s][0] + start, sent_spans[s][0] + end)
            ]

        tokenized_text = tokenized_text + toks
        text_spans = text_spans + abs_spans

    ## Now we have the token list and the spans.  We should be able to continue finding sentence boundaries as before
    tags = nltk.pos_tag(tokenized_text)
    sent_boundaries = [0] * len(tokenized_text)

    ## figure out which tokens are at the end of a sentence
    tok_counter = 0

    for s in range(0, len(sentences)):
        sent = sentences[s]

        if "\n" in sent:
            sent_newline = sent.split("\n")
            for sn in sent_newline:
                sent_split = WhitespaceTokenizer().tokenize(sn)
                nw_idx = len(sent_split) + tok_counter - 1
                sent_boundaries[nw_idx] = 1
                tok_counter = tok_counter + len(sent_split)

        else:
            sent_split = WhitespaceTokenizer().tokenize(sent)
            nw_idx = len(sent_split) + tok_counter - 1
            sent_boundaries[nw_idx] = 1
            tok_counter = tok_counter + len(sent_split)

    return raw_text, text, tokenized_text, text_spans, tags, sent_boundaries
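
The key step in getWhitespaceTokens is turning sentence-relative token spans into document-absolute offsets by adding each sentence's start position. A reduced sketch of just that conversion, using standard NLTK (it assumes the Punkt sentence model is installed; the sample text is made up):

from nltk.tokenize import WhitespaceTokenizer, sent_tokenize
from nltk.tokenize.util import align_tokens

text = "First sentence here. Second one follows."
sentences = sent_tokenize(text)
sent_spans = align_tokens(sentences, text)  # where each sentence starts and ends in the document

abs_spans = []
for (sent_start, _), sent in zip(sent_spans, sentences):
    for start, end in WhitespaceTokenizer().span_tokenize(sent):
        abs_spans.append((sent_start + start, sent_start + end))

print(abs_spans)  # token offsets relative to the full document, not the sentence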
Example #4
 def tokenize(self):
     sents = self.sent_tokenizer.tokenize(self.text)
     sent_spans = self.sent_tokenizer.span_tokenize(self.text)
     tokens = [self.tokenizer.tokenize(sent) for sent in sents]
     idxs = [
         align_tokens(['"' if x in ['``', "''"] else x for x in toks], sent)
         for sent, toks in zip(sents, tokens)
     ]
     return sents, tokens, idxs, sent_spans
Example #5
def custom_span_tokenize(text, language='english', preserve_line=False):
    tokens = custom_word_tokenize(text)
    tokens = ['"' if tok in ['``',"''"] else tok for tok in tokens]
    return align_tokens(tokens, text)

#print(custom_span_tokenize("He was a 47-year-old man born on 10/12/1975. His phone number is 170-574-2276"))

# documents = readSurrogate("../Datasets/i2b2_data/training-PHI-Gold-Set1")
# documents = tokenize(documents)
# print("Hi")
Example #6
 def span_tokenize(self, string):
     if self.__tokenizer == 'nltk':
         raw_tokens = nltk.word_tokenize(string)
         if ('"' in string) or ("''" in string):
             matched = [m.group() for m in re.finditer(r"``|'{2}|\"", string)]
             tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
         else:
             tokens = raw_tokens
         spans = align_tokens(tokens, string)
     return spans
Example #7
    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

            Additional example
            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        for tok in align_tokens(tokens, text):
            yield tok
Example #8
 def __call__(self, doc, **kwargs):
     if doc.text is None:
         return doc
     if self.has_span_tokenize:
         spans = self.tokenizer.span_tokenize(doc.text)
     else:
         tks = self.tokenizer.tokenize(doc.text)
         spans = align_tokens(tks, doc.text)
     annset = doc.annset(self.out_set)
     for span in spans:
         annset.add(span[0], span[1], self.token_type)
     return doc
Example #9
def custom_span_tokenize(text, language='english', preserve_line=True):
    """
            Returns a spans of tokens in text.

            :param text: text to split into words
            :param language: the model name in the Punkt corpus
            :type language: str
            :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
            :type preserver_line: bool
            """
    tokens = custom_word_tokenize(text)
    tokens = ['"' if tok in ['``', "''"] else tok for tok in tokens]
    return align_tokens(tokens, text)
Example #10
def tokenize(text):
    sents = sent_tokenizer.tokenize(text)
    sent_spans = sent_tokenizer.span_tokenize(text)
    tokens = [tokenizer.tokenize(sent) for sent in sents]
    idxs = [
        align_tokens([
            x.replace('``', '"').replace("''", '"')
            if '``' in x or "''" in x else x for x in toks
        ],
                     sent.replace('``', '"').replace("''", '"'))
        for sent, toks in zip(sents, tokens)
    ]
    return sents, tokens, idxs, sent_spans
Example #11
    def _process(self, input_pack: DataPack):
        inputs = self.tokenizer(input_pack.text, return_tensors="pt")
        tokens = self.tokenizer.convert_ids_to_tokens(
            inputs['input_ids'][0].tolist())[1:-1]
        tokens_clean = [
            token.replace('##', '') if token.startswith('##') else token
            for token in tokens
        ]

        for i, (begin, end) in enumerate(
                align_tokens(tokens_clean, input_pack.text.lower())):
            subword = Subword(input_pack, begin, end)
            subword.is_subword = tokens[i].startswith('##')
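
The same subword-alignment idea can be shown without the transformers dependency: strip the '##' continuation markers and align the cleaned pieces against the lower-cased text. The pieces below are hand-written stand-ins for real WordPiece output:

from nltk.tokenize.util import align_tokens

text = "Tokenization matters."
pieces = ["token", "##ization", "matters", "."]  # assumed WordPiece-style output
clean = [p[2:] if p.startswith("##") else p for p in pieces]

for piece, (begin, end) in zip(pieces, align_tokens(clean, text.lower())):
    print(piece, (begin, end), "is_subword =", piece.startswith("##"))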
Example #12
    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
Example #13
    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> TreebankWordTokenizer().span_tokenize(s) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        """
        tokens = self.tokenize(text)
        return align_tokens(tokens, text)
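
This simpler version only works while every token appears verbatim in the text; as soon as the input contains double quotes, the tokenizer emits `` and '' and align_tokens raises. A short demonstration of the failure mode that motivates the quote restoration in the other examples (plain NLTK, sample text made up):

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.util import align_tokens

text = 'A "quoted" word.'
tokens = TreebankWordTokenizer().tokenize(text)  # ['A', '``', 'quoted', "''", 'word', '.']
try:
    align_tokens(tokens, text)
except ValueError as err:
    print(err)  # reports that the substring '``' was not found in the text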
Example #14
    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> TreebankWordTokenizer().span_tokenize(s) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        """
        tokens = self.tokenize(text)
        return align_tokens(tokens, text)
Example #15
    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> TreebankWordTokenizer().span_tokenize(s) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s)
        if '"' in text:
            # Find double quotes and converted quotes
            matched = [
                m.group() for m in re.finditer(r"``|'{2}|\"", text)
            ]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        return align_tokens(tokens, text)
Example #16
def main(args):
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n"
        )
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    combine_sentences = True
    token_threshold = 100

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write(
                'There were multiple valid xml files for file %s\n' %
                (text_name))
            filtered_names = []
            for xml_name in xml_names:
                if 'Relation' in xml_name:
                    filtered_names.append(xml_name)
            if len(filtered_names) == 1:
                sys.stderr.write(
                    'Picking the file with "Relation" in the title: %s\n' %
                    (filtered_names[0]))
                xml_names = filtered_names
            else:
                sys.exit(-1)
        xml_name = xml_names[0]

        section_texts = []
        sentences = []
        text = ''
        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            cur_section = []
            cur_ind = 0
            section_start = 0
            for line in f.readlines():
                text += line
                line_len = len(line)
                line = line.rstrip()
                if line.startswith('[meta') or line.startswith(
                        '[start section') or line.startswith('[end section'):
                    if len(cur_section) > 0:
                        section_texts.append('\n'.join(cur_section))
                        section_text = '\n'.join(cur_section)
                        section_sents = rush.segToSentenceSpans(section_text)
                        if len(section_sents) > 0:
                            section_sents[0].text = '<section>'
                            #section_sents[-1].text = '</section>'
                        for section_sent in section_sents:
                            section_sent.begin += section_start
                            section_sent.end += section_start
                        sentences.extend(section_sents)
                        cur_section = []
                    section_start = cur_ind + line_len
                else:
                    cur_section.append(line)
                cur_ind += line_len

        #sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        merged_sentences = []

        if combine_sentences:
            for sentence_ind, sentence in enumerate(sentences):
                sent_txt = text[sentence.begin:sentence.end]

                if tb_tokenize:
                    raw_tokens = tokenizer.tokenize(sent_txt)

                    # From https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer.span_tokenize
                    # Convert converted quotes back to original double quotes
                    # Do this only if original text contains double quote(s) or double
                    # single-quotes (because '' might be transformed to `` if it is
                    # treated as starting quotes).
                    if ('"' in sent_txt) or ("''" in sent_txt):
                        # Find double quotes and converted quotes
                        matched = [
                            m.group()
                            for m in re.finditer(r"``|'{2}|\"", sent_txt)
                        ]

                        # Replace converted quotes back to double quotes
                        tokens = [
                            matched.pop(0) if tok in ['"', "``", "''"] else tok
                            for tok in raw_tokens
                        ]
                    else:
                        tokens = raw_tokens
                else:
                    tokens = tokenize(sent_txt)

                    # fix apostrophe s ('s) to be one token
                    def fix_simple_tokenize(tokens):
                        new_tokens = []
                        ind = 0
                        while ind < len(tokens):
                            if tokens[ind] == "'" and ind + 1 < len(
                                    tokens) and tokens[ind + 1] == 's':
                                new_tokens.append("'s")
                                ind += 2
                            else:
                                new_tokens.append(tokens[ind])
                                ind += 1

                        return new_tokens

                    tokens = fix_simple_tokenize(tokens)

                if text[sentence.end] == '\n':
                    tokens.append('<cr>')

                # print("Sentence number %d has %d tokens" % (sentence_ind, len(tokens)))

                if len(sent_tokens) > 0 and (
                        len(sent_tokens[-1]) +
                        len(tokens)) < token_threshold and sentence.text == '':
                    sent_tokens[-1].extend(tokens)
                    merged_sentences[-1].end = sentence.end
                else:
                    sent_tokens.append(tokens)
                    merged_sentences.append(sentence)
            for tokens in sent_tokens:
                while tokens[-1] == "<cr>":
                    tokens.pop()

            sentences = merged_sentences
        else:
            for sentence in sentences:
                sent_txt = text[sentence.begin:sentence.end]
                sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url,
                          json={
                              'sent_tokens': sent_tokens,
                              'metadata': text_name
                          })
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        rel_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            sent_rels = json['relations'][sent_ind]
            event_ids = []
            timex_ids = []

            meta_rev_loc = sent_txt.find('[meta rev_date')
            if meta_rev_loc >= 0:
                meta_rev_end = sent_txt.find(']', meta_rev_loc)
                meta_rev_loc += sentence.begin
                meta_rev_end += sentence.begin

            # Replace <cr> with empty string so that tokens align again,
            # then after alignment add them back in so token offsets from classifier are correct.
            cr_token_inds = []
            num_crs_at_position = []
            for ind in range(len(sent_tokens[sent_ind])):
                num_crs_at_position.append(len(cr_token_inds))
                if sent_tokens[sent_ind][ind] == '<cr>':
                    cr_token_inds.append(ind)
                    sent_tokens[sent_ind][ind] = ''

            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[
                    begin_token_ind +
                    num_crs_at_position[begin_token_ind]][0] + sentence.begin
                event_end_offset = token_spans[
                    end_token_ind +
                    num_crs_at_position[end_token_ind]][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name

                if event_text.endswith('_date'):
                    annot.properties['datesectiontime'] = 'True'
                    event_ids.append(-1)
                else:
                    event_ids.append(annot.id)
                    annot.spans = ((event_start_offset, event_end_offset), )
                    annot.type = "EVENT"
                    annot.properties['DocTimeRel'] = dtr
                    anafora_data.annotations.append(annot)

                cur_id += 1

                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[
                    begin_token_ind +
                    num_crs_at_position[begin_token_ind]][0] + sentence.begin
                timex_end_offset = token_spans[
                    end_token_ind +
                    num_crs_at_position[end_token_ind]][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                if meta_rev_loc >= 0 and timex_start_offset > meta_rev_loc and timex_end_offset < meta_rev_end:
                    timex_ids.append(-1)
                elif time_class == 'SECTIONTIME':
                    timex_ids.append(-1)
                elif not re.match(r'\d{5}', timex_text) is None:
                    timex_ids.append(-1)
                else:
                    # create anafora entry
                    annot = AnaforaEntity()
                    annot.id = str(cur_id) + "@e@" + text_name
                    timex_ids.append(annot.id)
                    cur_id += 1
                    annot.spans = ((timex_start_offset, timex_end_offset), )
                    annot.type = "TIMEX3"
                    annot.properties['Class'] = time_class
                    anafora_data.annotations.append(annot)

                #print("Found timex %s" % (timex_text))

            if not 'path' in text_name.lower():
                # no relations in pathology notes, so if we find any they are false positives.
                for rel in sent_rels:
                    arg1_type, arg1_ind = rel['arg1'].split('-')
                    arg2_type, arg2_ind = rel['arg2'].split('-')
                    if arg1_type == 'EVENT':
                        arg1 = event_ids[int(arg1_ind)]
                    elif arg1_type == 'TIMEX':
                        arg1 = timex_ids[int(arg1_ind)]

                    if arg1 == -1:
                        continue

                    if arg2_type == 'EVENT':
                        arg2 = event_ids[int(arg2_ind)]
                    elif arg2_type == 'TIMEX':
                        arg2 = timex_ids[int(arg2_ind)]

                    if arg2 == -1:
                        continue

                    reln = AnaforaRelation()
                    reln.id = str(rel_id) + '@r@' + text_name
                    rel_id += 1
                    reln.type = 'TLINK'
                    reln.properties['Type'] = rel['category']
                    reln.properties['Source'] = arg1
                    reln.properties['Target'] = arg2

                    anafora_data.annotations.append(reln)

        #break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
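
The '<cr>' handling above relies on a property of align_tokens: an empty-string token gets a zero-width span at the current position, so blanking out artificial tokens leaves the list indices and spans of the real tokens intact. A tiny sketch of that trick (the token list is made up):

from nltk.tokenize.util import align_tokens

sent_txt = "value one"
tokens = ["value", "<cr>", "one"]  # '<cr>' is an artificial token not present in the text
blanked = ["" if t == "<cr>" else t for t in tokens]
print(align_tokens(blanked, sent_txt))  # [(0, 5), (5, 5), (6, 9)]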
Example #17
 def span_tokenize(self, string):
     if self.__tokenizer == 'jieba':
         tokens = self.tokenize(string)
         spans = align_tokens(tokens, string)
     return spans
Example #18
File: nlp.py Project: buriy/sberbank-qa
    def align_tokens(self):
        if self.tokens_spans is None:
            self.tokens_spans = align_tokens([t.word for t in self.tokens],
                                             self.text)

        return self.tokens_spans
Example #19
def main(args):
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n"
        )
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write(
                'There were multiple valid xml files for file %s' %
                (text_name))
            sys.exit(-1)
        xml_name = xml_names[0]

        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()

        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []

        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]
            sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)

                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset), )
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                anafora_data.annotations.append(annot)

                #print("Found timex %s" % (timex_text))

        #break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
Example #20
    def get_usable_tokens(sent: str) -> List[TaggedToken]:
        # TODO: complexity is bad!
        # TODO: stricter linting rules?
        tokens = word_tokenize(sent)

        quote_pattern = r"``|''|\""
        quotes = [m.group() for m in re.finditer(quote_pattern, sent)]
        restored_tokens = [
            quotes.pop(0) if re.match(quote_pattern, tok) else tok
            for tok in tokens
        ]

        token_spans = align_tokens(restored_tokens, sent)
     tagged: List[Tuple[str, POS]] = pos_tag(tokens)

        regulars: Dict[Span, POS] = {}
        for i, span in enumerate(token_spans):
            regulars[span] = tagged[i][1] if tagged[i][1] not in literalised_pos else 'LITERAL'

        word_spans = Corpifier.span_word(sent)

        irregulars = {}  # irregular span to index
        for i, s in enumerate(token_spans):
            if s not in word_spans:
                irregulars[s] = i

        def merge_irregular_spans(spans: Dict[Span, int]) -> Set[Span]:
            spans_t = spans.items()
            shift: List[Tuple[Span, List[int]]] = []  # span of words and
            for s, i in spans_t:
                if len(shift) == 0:
                    shift.append((s, [i]))
                    continue

                if s[0] == shift[len(shift) - 1][0][1]:
                    prev = shift.pop(len(shift) - 1)
                    prev_span, ind = prev[0], prev[1]
                    new_span = (prev_span[0], s[1])
                    ind.append(i)
                    shift.append((new_span, ind))
                else:
                    shift.append((s, [i]))

            ret: Set[Span] = set()
            for span, _ in shift:
                ret.add(span)
            return ret

        merged_irregulars = merge_irregular_spans(irregulars)

        ret: List[TaggedToken] = []
        for i, w_span in enumerate(word_spans):
            tag: POS
            word: str = sent[w_span[0]:w_span[1]]
            spaced = not (i < len(word_spans) - 1 and w_span[1] == word_spans[i + 1][0])
            if w_span in merged_irregulars:
                if word.lower() in apostrophised:
                    tag = apostrophised[word.lower()]
                else:
                    tag = 'LITERAL'
            elif w_span in regulars:
                tag = regulars[w_span]
            else:
                tag = 'LITERAL'
                # print("unexpected literal: " + word + " from " + sent)
            ret.append(TaggedToken(word, tag, spaced))
        return ret
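
For comparison, the core of get_usable_tokens (restore quote tokens, align them, then pair the spans with POS tags) reduces to a few lines of plain NLTK; this sketch assumes the Punkt and perceptron-tagger models are installed and uses a made-up sentence:

import re

from nltk import pos_tag, word_tokenize
from nltk.tokenize.util import align_tokens

sent = 'He said "fine".'
tokens = word_tokenize(sent)
quote_pattern = r"``|''|\""
quotes = [m.group() for m in re.finditer(quote_pattern, sent)]
restored = [quotes.pop(0) if re.match(quote_pattern, tok) else tok for tok in tokens]

for (start, end), (_, tag) in zip(align_tokens(restored, sent), pos_tag(tokens)):
    print(sent[start:end], tag, (start, end))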