def read_conll_depparse(filename):
    """Read dependency-parsed sentences from a CoNLL-style file.

    Blank lines separate sentences.  For each token line the columns used
    are: 0 = token index, 1 = word, 2 = lemma, 4 = POS tag, 6 = head index,
    7 = dependency label, 8 = enhanced dependencies ('|'-separated
    ``head:label`` pairs, '_' if absent).

    :param filename: path to the (possibly compressed) CoNLL file.
    :return: list of Sentence objects with tokens and dependencies attached.
    """
    all_sents = []
    sent_idx = 0
    sent = Sentence(sent_idx)
    # use a context manager so the handle is closed even on error
    # (original leaked it); iterate the handle instead of readlines()
    # to avoid buffering the whole file.
    with smart_file_handler(filename, 'r') as fin:
        for line_idx, line in enumerate(fin):
            if line == '\n':
                # sentence boundary: keep the finished sentence and start
                # a new one (no deepcopy needed -- sent is rebound below)
                all_sents.append(sent)
                sent_idx += 1
                sent = Sentence(sent_idx)
                continue
            items = line.strip().split('\t')
            try:
                token_idx = int(items[0])
            except ValueError:
                # skip non-token lines (comments, multi-word token ranges)
                continue
            if token_idx == sent.num_tokens:
                # a repeated index would desynchronize token numbering
                log.warning(
                    'line #{} ({}) has duplicated token index, ignored.'.
                    format(line_idx, line.strip().replace('\t', ' ')))
                continue
            word = items[1]
            lemma = items[2]
            pos = items[4]
            sent.add_token(Token(word, lemma, pos))
            try:
                head_idx = int(items[6])
            except ValueError:
                continue
            dep_label = items[7]
            if dep_label != 'root':
                # convert 1-based CoNLL indices to 0-based
                sent.add_dep(
                    Dependency(label=dep_label,
                               head_idx=head_idx - 1,
                               mod_idx=token_idx - 1,
                               extra=False))
            if items[8] != '_':
                # enhanced dependencies: head:label pairs joined by '|';
                # the label itself may contain ':' so split only once
                for e_dep in items[8].strip().split('|'):
                    head_part, _, e_dep_label = e_dep.partition(':')
                    try:
                        e_dep_head_idx = int(head_part)
                    except ValueError:
                        continue
                    sent.add_dep(
                        Dependency(label=e_dep_label,
                                   head_idx=e_dep_head_idx - 1,
                                   mod_idx=token_idx - 1,
                                   extra=True))
    # fix: the original dropped the last sentence when the file did not
    # end with a trailing blank line
    if sent.num_tokens > 0:
        all_sents.append(sent)
    return all_sents
def read_doc_from_corenlp(filename):
    """Parse a CoreNLP XML output file into a Document.

    :param filename: path to the CoreNLP XML file; the basename (without
        extension) becomes the document name.
    :return: Document built from the parsed sentences and corefs.
    """
    log.info('Reading CoreNLP document from {}'.format(filename))
    input_xml = smart_file_handler(filename)
    try:
        xml_parser = etree.XMLParser(target=CoreNLPTarget())
        # the custom target returns (sentences, corefs) from parse()
        sents, corefs = etree.parse(input_xml, xml_parser)
        doc_name = splitext(basename(filename))[0]
        doc = Document.construct(doc_name, sents, corefs)
    finally:
        # fix: the original leaked the handle if parsing raised
        input_xml.close()
    return doc
from utils import smart_file_handler

if __name__ == '__main__':
    # Command-line driver: convert a directory of CoreNLP-parsed xml.bz2
    # files into a single script corpus file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input_path',
                            help='directory to CoreNLP parsed xml files')
    arg_parser.add_argument('output_path',
                            help='path to write script corpus file')
    arg_parser.add_argument('-v', '--verbose',
                            help='print all document names',
                            action='store_true')
    args = arg_parser.parse_args()

    # deterministic processing order over the matching files
    candidates = (join(args.input_path, f) for f in listdir(args.input_path))
    input_files = sorted(
        path for path in candidates
        if isfile(path) and path.endswith('xml.bz2'))

    script_corpus = ScriptCorpus()
    for input_f in input_files:
        doc = read_corenlp_doc(input_f, verbose=args.verbose)
        script = Script.from_doc(doc)
        # keep only documents that actually contain events
        if script.has_events():
            script_corpus.add_script(script)

    with smart_file_handler(args.output_path, 'w') as fout:
        fout.write(script_corpus.to_text())
def __iter__(self):
    """Yield whitespace-tokenized lines from every file in self.dirname.

    Each yielded item is the list of tokens from one line of one file.
    """
    for fname in listdir(self.dirname):
        # fix: use a context manager so each handle is closed promptly
        # instead of leaking until garbage collection (smart_file_handler
        # supports the with-statement protocol elsewhere in this project)
        with smart_file_handler(join(self.dirname, fname), 'r') as fin:
            for line in fin:
                yield line.split()
def __iter__(self):
    """Yield self.from_text_fn(line) for each non-empty line of each file.

    Blank lines are skipped; lines are stripped before parsing.
    """
    for filename in self.filenames:
        # fix: stream line by line (no readlines() full-file buffering)
        # and close each handle deterministically via a context manager
        with smart_file_handler(filename, 'r') as fin:
            for raw_line in fin:
                text = raw_line.strip()
                if text:
                    yield self.from_text_fn(text)