def _next_passage(self):
    passage = None
    if self._split_iter is None:
        try:
            file = next(self._files_iter)
        except StopIteration:  # Finished iteration
            raise
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = self.attempts
            while not os.path.exists(file):
                with external_write_mode(file=sys.stderr):
                    if attempts == 0:
                        print("File not found: %s" % file, file=sys.stderr)
                        return None
                    print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(self.delay)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError) as e:  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = self.converters.get(ext.lstrip("."))
                if converter is None:
                    raise IOError("Could not read %s file. See error message above. "
                                  "If this file's format is not %s, try adding '.txt' suffix "
                                  "to read as plain text: '%s'" % (ext, ext, file)) from e
                self._file_handle = open(file, encoding="utf-8")
                self._split_iter = iter(converter(chain(self._file_handle, [""]),
                                                  passage_id=base, lang=self.lang))
    if self.split:
        if self._split_iter is None:
            self._split_iter = (passage,)
        self._split_iter = iter(s for p in self._split_iter
                                for s in split2segments(p, is_sentences=self.sentences, lang=self.lang))
    if self._split_iter is not None:  # Either set before or initialized now
        try:
            passage = next(self._split_iter)
        except StopIteration:  # Finished this converter
            self._split_iter = None
            if self._file_handle is not None:
                self._file_handle.close()
                self._file_handle = None
            return None
    return passage
def _next_passage(self):
    passage = None
    if self._split_iter is None:
        try:
            file = next(self._files_iter)
        except StopIteration:  # Finished iteration
            raise
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = 3
            while not os.path.exists(file):
                if attempts == 0:
                    print("File not found: %s" % file, file=sys.stderr)
                    return next(self)
                print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(5)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = self.converters[ext.lstrip(".")]
                self._file_handle = open(file, encoding="utf-8")
                self._split_iter = iter(converter(self._file_handle, passage_id=base))
    if self.split:
        if self._split_iter is None:
            self._split_iter = (passage,)
        self._split_iter = iter(s for p in self._split_iter
                                for s in split2segments(p, is_sentences=self.sentences))
    if self._split_iter is not None:  # Either set before or initialized now
        try:
            # noinspection PyTypeChecker
            passage = next(self._split_iter)
        except StopIteration:  # Finished this converter
            self._split_iter = None
            if self._file_handle is not None:
                self._file_handle.close()
                self._file_handle = None
            return next(self)
    return passage
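# Both methods above are written for a lazy-loading iterator class: the first
# variant signals "skip this file" by returning None, while the second recurses
# via next(self). Below is a minimal sketch of a host class for the first
# variant; the class name and constructor arguments are assumptions for
# illustration only, not taken from the snippets above.
class PassageIterator:
    """Hypothetical host class; paste the first _next_passage method above
    into this class body to complete it."""

    def __init__(self, files, converters, split=False, sentences=False,
                 lang="en", attempts=3, delay=5):
        self._files_iter = iter(files)  # file paths and/or Passage objects
        self.converters = converters    # maps file extension -> converter function
        self.split = split              # split passages into segments?
        self.sentences = sentences      # segment by sentence rather than paragraph
        self.lang = lang
        self.attempts = attempts        # retries before giving up on a missing path
        self.delay = delay              # seconds to sleep between retries
        self._split_iter = None
        self._file_handle = None

    def __iter__(self):
        return self

    def __next__(self):
        # _next_passage returns None for a missing file or an exhausted
        # converter; looping here (instead of recursing like the second
        # variant) avoids unbounded recursion on long runs of bad files.
        while True:
            passage = self._next_passage()
            if passage is not None:
                return passage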
def _read(self, file_path: str) -> Iterator[Instance]:
    # TODO: In the future, I would like to pass only a string to text_to_instance.
    #  Training data comes pre-tokenized and the text may not. Need to think about it.
    #  Also need to think about how to support reading features from a file.
    #  For both issues, maybe another reader is the solution? Sounds reasonable.
    file_list: List[Tuple[str, str]] = list(self._get_file_list(file_path))
    if self.shuffle:
        shuffle(file_list)
    for file_path, dataset_title in file_list:
        passage = file2passage(file_path)
        tokenized_text = [node.text for node in sorted(passage.layer("0").all,
                                                       key=lambda x: x.position)]
        id = passage.ID
        assert id, "Attribute 'id' is required per passage when using this model"
        lang = passage.attrib.get("lang")
        assert lang, "Attribute 'lang' is required per passage when using this model"
        gold_tree = None
        if "1" in passage._layers:  # layer "1" holds the gold annotation
            gold_tree = copy.deepcopy(passage)
        if self.filter_func and self.filter_func(tokenized_text):
            print(f'Filtering out passage {id}!')
            continue
        # TODO: Move this out of the class
        if len(tokenized_text) > 200:
            print(f'Filtering out passage {id} with length longer than 200 tokens!')
            continue
        # TODO: There is no need to give both the passage object and the
        #  tokenized_text, language and id. Also, if we support text-only
        #  input, you don't have these.
        yield self.text_to_instance(tokenized_text, passage, dataset_title, lang, id, gold_tree)
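# _read above follows AllenNLP's DatasetReader convention of lazily yielding
# Instance objects. A hedged usage sketch follows; the reader class name and
# its constructor arguments are assumptions, since only the _read method is
# shown above.
reader = UccaDatasetReader(shuffle=True, filter_func=None)  # hypothetical class name
for instance in reader.read("path/to/ucca/xml/dir"):  # read() drives _read()
    print(instance)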
def get_passages(streusle_file, ucca_path, annotate=True, target='prep',
                 docids=None, ignore=None, diverging_tok=False, token_map={}):
    unit_counter = 0
    for doc_id, doc in get_streusle_docs(streusle_file).items():
        ucca_file = ucca_path + '/xml/' + doc_id + '.xml'
        if docids and doc_id not in docids:
            print(f'{doc_id} not reviewed')
            continue
        if ignore and doc_id in ignore:
            print(f'{doc_id} ignored due to diverging tokenization')
            continue
        if not os.path.exists(ucca_file):
            print(f'{ucca_file}: file does not exist')
            continue
        passage = uconv.file2passage(ucca_file)
        tokens = [tok['word'] for tok in doc['toks']]
        terminals = passage.layer('0').pairs
        term2tok = {}
        tok2term = {}
        if diverging_tok:
            # Align UCCA terminals with SNACS tokens when tokenizations differ,
            # accumulating terminal text until it matches a (mapped) token.
            j = 0
            acc = ''
            for i, (_, t) in enumerate(terminals):
                term2tok[i] = j
                tok2term[j] = i
                if j >= len(tokens):
                    assert False, (t.text, i, j, len(tokens), tokens)
                tok = tokens[j]
                mapped = token_map.get(tok, tok)
                if mapped.startswith(t.text):
                    acc = t.text
                elif t.text in mapped:
                    acc += t.text
                else:
                    acc = t.text
                if acc == mapped:
                    j += 1
                assert acc in mapped, (acc, mapped)
        else:
            diff_term_tok = len(terminals) - len(tokens)
            if diff_term_tok != 0:
                for (_, term), tok in zip(terminals, tokens):
                    assert tok == term.text
            if diff_term_tok > 0:
                term2tok = tok2term = dict(enumerate(range(len(tokens))))
            else:
                term2tok = tok2term = dict(enumerate(range(len(terminals))))
        doc['ends'] = [tok2term[e - 1] + 1 for e in doc['ends']]
        if annotate:
            for i, (_, term) in enumerate(terminals):
                if i not in term2tok:
                    continue
                tok = doc['toks'][term2tok[i]]
                for k, v in tok.items():
                    if k == 'head' and int(v) > 0:
                        term.extra[k] = str(tok2term[int(v) - 1] + 1)
                    elif k != '#':
                        term.extra[k] = v
            for unit in list(doc['exprs'].values()):
                unit_counter += 1
                terminal = terminals[tok2term[unit['toknums'][0] - 1]][1]
                terminal.extra['toknums'] = ' '.join(map(str, unit['toknums']))
                terminal.extra['local_toknums'] = ' '.join(map(str, unit['local_toknums']))
                terminal.extra['lexlemma'] = unit['lexlemma']
                terminal.extra['lexcat'] = unit['lexcat']
                if unit['lexcat'] == 'DISC':
                    unit['ss'] = '`d'  # fixed: was a no-op comparison (==)
                terminal.extra['config'] = unit['heuristic_relation']['config']
                terminal.extra.update(unit['heuristic_relation'])
                terminal.extra['gov'] = None if terminal.extra['gov'] is None \
                    else tok2term[int(terminal.extra['gov']) - 1] + 1
                terminal.extra['obj'] = None if terminal.extra['obj'] is None \
                    else tok2term[int(terminal.extra['obj']) - 1] + 1
                if target == 'obj' and unit['heuristic_relation']['obj'] is not None:
                    obj = terminals[unit['heuristic_relation']['obj'] - 1][1]
                    obj.extra['ss'] = unit.get('ss', '')
                    obj.extra['ss2'] = unit.get('ss2', '')
                else:
                    terminal.extra['ss'] = unit.get('ss', '')
                    terminal.extra['ss2'] = unit.get('ss2', '')
        yield doc, passage, term2tok
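# A hedged usage sketch for get_passages; both paths are placeholders. Each
# iteration yields the STREUSLE document dict, the UCCA passage (annotated
# in-place when annotate=True), and the terminal-index -> token-index map.
for doc, passage, term2tok in get_passages("streusle.json", "ucca-corpus-dir",
                                           annotate=True, target="prep"):
    print(passage.ID, len(term2tok))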
            new_str += ']'
        else:
            new_str += text[ind]
            ind += 1
    print(new_str)
    return new_str


out_file = open('D_categories_fr.csv', 'a')
config_out_file = open('D_configurations_fr.csv', 'a')
out_file.write(
    'passage_id\tnode_id\tnumChildren\tedge_tag\tedge_id\tedge\tcomposite\t'
    'simple_adverb\tnegation\tmodal_aux\tword_count\tadjective\tdiscontinuous\n'
)
totald_in_psg = 0
for i in range(911, 942):
    converted_psg = convert.file2passage(f'./UCCA_French-20K-master/xml/passage{i}.xml')
    print(f'passage number {i}')
    print(converted_psg)  # Passage object converted from the XML format
    layers_list = list(converted_psg.layers)
    layer0 = layers_list[0]
    layer1 = layers_list[1]
    for j, one in enumerate(layer1.all):  # iterate over the layer-1 nodes
        for e in one.incoming:  # get the edge label of this node
            if e.tag == 'D':  # or e.tag == "E":
                print(j, one.ID, len(list(one.children)))
                totald_in_psg += 1
                print(e.tag, " : ", e.ID, e.child)
                out_file.write(
                    f'{i}\t{one.ID}\t{len(list(one.children))}\t{e.tag}\t{e.ID}\t{e.child}\t')
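# The script above opens its CSV handles in append mode at module level and
# never closes them, so buffered rows can be lost on an early exit. A sketch
# of the same setup using context managers instead; the column names are
# copied from the header written above.
with open('D_categories_fr.csv', 'a') as out_file, \
        open('D_configurations_fr.csv', 'a') as config_out_file:
    out_file.write('passage_id\tnode_id\tnumChildren\tedge_tag\tedge_id\tedge\t'
                   'composite\tsimple_adverb\tnegation\tmodal_aux\tword_count\t'
                   'adjective\tdiscontinuous\n')
    # ... passage-processing loop from above goes here ...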