Пример #1
0
 def _next_passage(self):
     """Read and return the next passage, or None if the caller should retry.

     Returns None either when a file is missing after exhausting
     ``self.attempts`` retries, or when the current converter's split
     iterator is exhausted.  Propagates StopIteration once the underlying
     files iterator is finished.
     """
     passage = None
     if self._split_iter is None:
         # StopIteration from the files iterator signals the end of
         # iteration; let it propagate to the caller.  (The original
         # wrapped this in a no-op `except StopIteration: raise`.)
         file = next(self._files_iter)
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = self.attempts
             # Retry loop for files that are not (yet) on disk.
             while not os.path.exists(file):
                 with external_write_mode(file=sys.stderr):
                     if attempts == 0:
                         print("File not found: %s" % file, file=sys.stderr)
                         return None
                     print("Failed reading %s, trying %d more times..." %
                           (file, attempts),
                           file=sys.stderr)
                 time.sleep(self.delay)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError,
                     ParseError) as e:  # Failed to read as passage file
                 # Fall back to a converter registered for the extension.
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters.get(ext.lstrip("."))
                 if converter is None:
                     raise IOError(
                         "Could not read %s file. See error message above. "
                         "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                         " '%s'" % (ext, ext, file)) from e
                 # The handle stays open while the converter lazily yields
                 # passages; it is closed when the iterator is exhausted.
                 self._file_handle = open(file, encoding="utf-8")
                 # chain(..., [""]) appends a trailing empty line —
                 # presumably so the converter flushes its final passage;
                 # TODO confirm against the converter contract.
                 self._split_iter = iter(
                     converter(chain(self._file_handle, [""]),
                               passage_id=base,
                               lang=self.lang))
         if self.split:
             # Wrap whatever source we have so each passage is further
             # split into paragraph/sentence segments.
             if self._split_iter is None:
                 self._split_iter = (passage, )
             self._split_iter = iter(
                 s for p in self._split_iter for s in split2segments(
                     p, is_sentences=self.sentences, lang=self.lang))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return None
     return passage
Пример #2
0
 def _next_passage(self):
     """Read and return the next passage.

     Recurses via ``next(self)`` to skip missing files and exhausted
     converters; propagates StopIteration once the files iterator is done.
     """
     passage = None
     if self._split_iter is None:
         # StopIteration from the files iterator signals the end of
         # iteration; let it propagate.  (The original wrapped this in a
         # no-op `except StopIteration: raise`.)
         file = next(self._files_iter)
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = 3  # hard-coded retry count
             while not os.path.exists(file):
                 if attempts == 0:
                     print("File not found: %s" % file, file=sys.stderr)
                     # Skip this file and move on to the next one.
                     return next(self)
                 print("Failed reading %s, trying %d more times..." %
                       (file, attempts),
                       file=sys.stderr)
                 time.sleep(5)  # hard-coded delay between retries, seconds
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError):  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 # NOTE(review): raises KeyError if no converter is
                 # registered for this extension — confirm intended.
                 converter = self.converters[ext.lstrip(".")]
                 # Handle stays open while the converter lazily yields
                 # passages; closed when the iterator is exhausted below.
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(
                     converter(self._file_handle, passage_id=base))
         if self.split:
             # Wrap the source so each passage is further split into
             # paragraph/sentence segments.
             if self._split_iter is None:
                 self._split_iter = (passage, )
             self._split_iter = iter(
                 s for p in self._split_iter
                 for s in split2segments(p, is_sentences=self.sentences))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             # noinspection PyTypeChecker
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return next(self)
     return passage
Пример #3
0
 def _next_passage(self):
     """Read and return the next passage, or None if the caller should retry.

     Returns None when a file is missing after ``self.attempts`` retries,
     or when the current converter's split iterator is exhausted.
     Propagates StopIteration once the files iterator is finished.
     """
     passage = None
     if self._split_iter is None:
         # StopIteration from the files iterator signals the end of
         # iteration; let it propagate.  (The original wrapped this in a
         # no-op `except StopIteration: raise`.)
         file = next(self._files_iter)
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = self.attempts
             # Retry loop for files that are not (yet) on disk.
             while not os.path.exists(file):
                 with external_write_mode(file=sys.stderr):
                     if attempts == 0:
                         print("File not found: %s" % file, file=sys.stderr)
                         return None
                     print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                 time.sleep(self.delay)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError) as e:  # Failed to read as passage file
                 # Fall back to a converter registered for the extension.
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters.get(ext.lstrip("."))
                 if converter is None:
                     raise IOError("Could not read %s file. See error message above. "
                                   "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                                   " '%s'" % (ext, ext, file)) from e
                 # Handle stays open while the converter lazily yields
                 # passages; closed when the iterator is exhausted below.
                 # chain(..., [""]) appends a trailing empty line —
                 # presumably to flush the final passage; TODO confirm.
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(converter(chain(self._file_handle, [""]), passage_id=base, lang=self.lang))
         if self.split:
             # Wrap the source so each passage is further split into
             # paragraph/sentence segments.
             if self._split_iter is None:
                 self._split_iter = (passage,)
             self._split_iter = iter(s for p in self._split_iter for s in
                                     split2segments(p, is_sentences=self.sentences, lang=self.lang))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return None
     return passage
Пример #4
0
 def _next_passage(self):
     """Read and return the next passage.

     Recurses via ``next(self)`` to skip missing files and exhausted
     converters; propagates StopIteration once the files iterator is done.
     """
     passage = None
     if self._split_iter is None:
         # StopIteration from the files iterator signals the end of
         # iteration; let it propagate.  (The original wrapped this in a
         # no-op `except StopIteration: raise`.)
         file = next(self._files_iter)
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = 3  # hard-coded retry count
             while not os.path.exists(file):
                 if attempts == 0:
                     print("File not found: %s" % file, file=sys.stderr)
                     # Skip this file and move on to the next one.
                     return next(self)
                 print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                 time.sleep(5)  # hard-coded delay between retries, seconds
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError):  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 # NOTE(review): raises KeyError if no converter is
                 # registered for this extension — confirm intended.
                 converter = self.converters[ext.lstrip(".")]
                 # Handle stays open while the converter lazily yields
                 # passages; closed when the iterator is exhausted below.
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(converter(self._file_handle, passage_id=base))
         if self.split:
             # Wrap the source so each passage is further split into
             # paragraph/sentence segments.
             if self._split_iter is None:
                 self._split_iter = (passage,)
             self._split_iter = iter(s for p in self._split_iter for s in
                                     split2segments(p, is_sentences=self.sentences))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             # noinspection PyTypeChecker
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return next(self)
     return passage
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Read UCCA passage files under *file_path* and yield one Instance each.

        Passages are filtered out (with a console message) when
        ``self.filter_func`` rejects their token list or when they exceed
        200 tokens.  Each passage must carry an ``ID`` and a ``lang``
        attribute.
        """
        # TODO: In the future, I would like to pass to the text_to_instance only a string.
        #  Training data comes pre-tokenized and the text may not. Need to think about it.
        #  Also Need to think how to support reading features from file.
        #  For both issues, maybe another reader is the solution?  Sound reasonable.
        file_list: List[Tuple[str, str]] = list(self._get_file_list(file_path))

        if self.shuffle:
            shuffle(file_list)

        for file_path, dataset_title in file_list:
            passage = file2passage(file_path)

            # Terminals live in layer "0"; sort by position to recover the
            # original token order.
            tokenized_text = [node.text for node in sorted(passage.layer("0").all, key=lambda x: x.position)]

            # Renamed from `id` to avoid shadowing the builtin.
            passage_id = passage.ID
            assert passage_id, "Attribute 'id' is required per passage when using this model"

            lang = passage.attrib.get("lang")
            assert lang, "Attribute 'lang' is required per passage when using this model"

            # Presumably layer "1" holds the gold annotation — deep-copy the
            # passage so later mutations do not affect it.  NOTE(review):
            # `_layers` is a private attribute; confirm no public API exists.
            gold_tree = None
            if "1" in passage._layers:
                gold_tree = copy.deepcopy(passage)

            if self.filter_func and self.filter_func(tokenized_text):
                print(f'Filtering out passage {passage_id}!')
                continue

            # TODO: Move it out of the class
            if len(tokenized_text) > 200:
                print(f'Filtering out passage {passage_id} with length longer than 200 tokens!')
                continue

            # TODO: There is no need to give both the passage object and the tokenized_text, language and id.
            #  Also, if we support text-only input, you don't have these.
            yield self.text_to_instance(tokenized_text, passage, dataset_title, lang, passage_id, gold_tree)
Пример #6
0
def get_passages(streusle_file,
                 ucca_path,
                 annotate=True,
                 target='prep',
                 docids=None,
                 ignore=None,
                 diverging_tok=False,
                 token_map=None):
    """Align STREUSLE documents with their UCCA passages and yield them.

    Yields ``(doc, passage, term2tok)`` for each document with a matching
    UCCA XML file, where ``term2tok`` maps UCCA terminal indices to
    STREUSLE token indices.

    :param streusle_file: path to the STREUSLE data file
    :param ucca_path: directory containing an ``xml/`` subdirectory with one
        ``<doc_id>.xml`` UCCA passage per document
    :param annotate: when True, copy STREUSLE token- and unit-level
        attributes onto the UCCA terminals' ``extra`` dicts
    :param target: ``'obj'`` attaches supersenses to the object terminal;
        any other value (default ``'prep'``) attaches them to the unit's
        own first terminal
    :param docids: optional collection of doc ids to keep (others skipped)
    :param ignore: optional collection of doc ids to skip
    :param diverging_tok: align by accumulating terminal text when the UCCA
        and STREUSLE tokenizations differ
    :param token_map: optional mapping from raw token to normalized form.
        Default changed from a shared mutable ``{}`` to ``None`` (same
        behavior, avoids the mutable-default-argument pitfall).
    """
    if token_map is None:
        token_map = {}

    unit_counter = 0

    for doc_id, doc in get_streusle_docs(streusle_file).items():
        ucca_file = ucca_path + '/xml/' + doc_id + '.xml'
        if (docids and doc_id not in docids):
            print(f'{doc_id} not reviewed')
            continue
        if (ignore and doc_id in ignore):
            print(f'{doc_id} ignored due to diverging tokenization')
            continue
        if not os.path.exists(ucca_file):
            print(f'{ucca_file}: file does not exist')
            continue

        passage = uconv.file2passage(ucca_file)

        tokens = [tok['word'] for tok in doc['toks']]
        terminals = passage.layer('0').pairs

        # term2tok: terminal index -> token index; tok2term is the inverse.
        term2tok = {}
        tok2term = {}

        if diverging_tok:
            # Accumulate terminal text until it matches the (mapped) token,
            # advancing the token index only on a full match.
            j = 0
            acc = ''
            for i, (_, t) in enumerate(terminals):
                term2tok[i] = j
                tok2term[j] = i
                if j >= len(tokens):
                    assert False, (t.text, i, j, len(tokens), tokens)
                tok = tokens[j]
                mapped = token_map.get(tok, tok)
                if mapped.startswith(t.text):
                    acc = t.text
                elif t.text in mapped:
                    acc += t.text
                else:
                    acc = t.text
                if acc == mapped:
                    j += 1
                assert acc in mapped, (acc, mapped)
        else:
            # Tokenizations must agree on the common prefix; the identity
            # mapping then covers the shorter of the two sequences.
            diff_term_tok = len(terminals) - len(tokens)
            if diff_term_tok != 0:
                for (_, term), tok in zip(terminals, tokens):
                    assert tok == term.text
            if diff_term_tok > 0:
                term2tok = tok2term = dict(enumerate(range(len(tokens))))
            else:
                term2tok = tok2term = dict(enumerate(range(len(terminals))))

        # Re-index sentence end positions from tokens to terminals.
        doc['ends'] = [tok2term[e - 1] + 1 for e in doc['ends']]

        if annotate:
            for i, (_, term) in enumerate(terminals):
                if i not in term2tok: continue
                tok = doc['toks'][term2tok[i]]
                for k, v in tok.items():
                    if k == 'head' and int(v) > 0:
                        # Re-index the dependency head from tokens to terminals.
                        term.extra[k] = str(tok2term[int(v) - 1] + 1)
                    elif k != '#':
                        term.extra[k] = v

            for unit in list(doc['exprs'].values()):

                unit_counter += 1

                # The unit is anchored on the terminal of its first token.
                terminal = terminals[tok2term[unit['toknums'][0] - 1]][1]
                terminal.extra['toknums'] = ' '.join(map(str, unit['toknums']))
                terminal.extra['local_toknums'] = ' '.join(
                    map(str, unit['local_toknums']))
                terminal.extra['lexlemma'] = unit['lexlemma']
                terminal.extra['lexcat'] = unit['lexcat']
                if unit['lexcat'] == 'DISC':
                    # BUGFIX: was `unit['ss'] == '`d'` — a comparison whose
                    # result was discarded; an assignment was clearly intended.
                    unit['ss'] = '`d'
                terminal.extra['config'] = unit['heuristic_relation']['config']
                terminal.extra.update(unit['heuristic_relation'])
                # Re-index governor/object token numbers to terminal numbers.
                terminal.extra['gov'] = None if terminal.extra[
                    'gov'] is None else tok2term[int(terminal.extra['gov']) -
                                                 1] + 1
                terminal.extra['obj'] = None if terminal.extra[
                    'obj'] is None else tok2term[int(terminal.extra['obj']) -
                                                 1] + 1
                if target == 'obj' and unit['heuristic_relation'][
                        'obj'] is not None:
                    obj = terminals[unit['heuristic_relation']['obj'] - 1][1]
                    obj.extra['ss'] = unit.get('ss', '')
                    obj.extra['ss2'] = unit.get('ss2', '')
                else:
                    terminal.extra['ss'] = unit.get('ss', '')
                    terminal.extra['ss2'] = unit.get('ss2', '')

        yield doc, passage, term2tok
Пример #7
0
            new_str += ']'
        else:
            new_str += text[ind]
            ind += 1
    print(new_str)
    return new_str


# NOTE(review): both files are opened in append mode and never closed in this
# script; they are written by code further below, so they cannot be wrapped in
# a `with` block here — consider closing them explicitly at the end.
out_file = open('D_categories_fr.csv', 'a')
config_out_file = open('D_configurations_fr.csv', 'a')
# Tab-separated header for the categories file.
# BUGFIX: corrected 'passge_id' -> 'passage_id'; dropped the pointless `f`
# prefix (the string has no placeholders).
out_file.write(
    'passage_id\tnode_id\tnumChildren\tedge_tag\tedge_id\tedge\tcomposite\tsimple_adverb\tnegation\tmodal_aux\tword_count\tadjective\tdiscontinuous\n'
)
totald_in_psg = 0  # running count of 'D'-tagged edges across all passages
for i in range(911, 942, 1):
    converted_psg = convert.file2passage(
        f'./UCCA_French-20K-master/xml/passage{i}.xml')
    print(f'passage number {i}')
    print(converted_psg)  # passage object converted from the xml format
    layers_list = list(converted_psg.layers)
    layer0 = layers_list[0]
    layer1 = layers_list[1]

    for j, one in enumerate(layer1.all):  # iterate through the node
        # get the edge label of this node
        for e in one.incoming:
            if e.tag == 'D':  # or e.tag == "E":
                print(j, one.ID, len(list(one.children)))
                totald_in_psg += 1
                print(e.tag, " : ", e.ID, e.child)
                out_file.write(
                    f'{i}\t{one.ID}\t{len(list(one.children))} \t{e.tag}\t{e.ID}\t{e.child}\t'