def testDefault(self):
    """
    Convert a two-segment OTPL file with ``otpl_to_text`` and check the
    generated plain-text file.

    The expected output is one line per segment, holding the first column
    (the tokens) of each row joined by single spaces.
    """
    # Echo log records to the console while this test runs
    # (might spam your console...).
    root_logger = logging.getLogger()
    console_handler = logging.StreamHandler()
    root_logger.addHandler(console_handler)

    try:
        self.interceptLogs('otplc.extractor')
        self.otpl_file.write(
            "This DT 6 nsubj B-NP NULL\n"
            "is VBZ 6 cop B-VP NULL\n"
            "Florian NNP 6 nn B-NP NULL\n"
            "ʼs POS 3 pos I-NP db:id\n"
            "weird JJ 6 amod I-NP NULL\n"
            "test NN 0 root I-NP NULL\n"
            ". DOT 6 punct O NULL\n\n"
            "And DT 6 nsubj B-NP NULL\n"
            "another VBZ 6 cop B-VP NULL\n"
            "one NN 0 root I-NP NULL\n"
            ". DOT 6 punct O NULL\n\n"
        )
        # Close the fixture so its content is on disk before it is converted.
        self.otpl_file.close()
        expected = 'This is Florian ʼs weird test .\nAnd another one .\n'

        # A return value of 0 means that no input file failed to convert.
        self.assertEqual(0, otpl_to_text(Configuration([self.otpl_file.name])))

        text_path = make_path_to(self.otpl_file.name, Configuration.TEXT_SUFFIX)

        # Read through a context manager so the handle is closed even if
        # the read fails.
        with open(text_path) as text_file:
            result = text_file.read()

        self.assertEqual(expected, result)
    finally:
        # Detach the console handler again; otherwise every run of this test
        # leaves one more handler on the root logger for all later tests.
        root_logger.removeHandler(console_handler)
def otpl_to_text(configuration):
    """
    Extract the text using the tokens of the OTPL files
    and store the results into separate plain-text files.

    For each input file, the output path is derived with ``make_path_to``
    using the configured text suffix. Every segment is written as one line,
    with the token column of its rows separated by single spaces.

    If ``configuration.colspec`` is ``None``, it is guessed from the segments
    of the file being processed and stored on the configuration object, so
    the same column specification is then used for all following files.

    :param configuration: a :class:`otplc.settings.Configuration` object
    :return: The number of failed conversion for the input files.
    :raises AssertionError: if the output path of an input file is identical
                            to the path of that input file
    """
    errors = 0

    for otpl_file in configuration.input_files:
        text_file = make_path_to(otpl_file, configuration.text_suffix)

        # Raise explicitly instead of using an ``assert`` statement: asserts
        # are removed when Python runs with -O, and the input file would then
        # be truncated when it is opened for writing below.
        if otpl_file == text_file:
            msg = "output text file and input OTPL file have the same path " \
                  "(ensure the OTPL file does not use the extension '{}')"
            raise AssertionError(msg.format(configuration.text_suffix))

        segments = configure_reader(otpl_file, configuration)

        # No reader could be set up for this file: count it as a failure and
        # carry on with the remaining input files.
        if segments is None:
            errors += 1
            continue

        if configuration.colspec is None:
            configuration.colspec = guess_colspec(segments)

        # Key used to select the token from each row.
        token = configuration.colspec.token

        try:
            with open(text_file, encoding=configuration.encoding, mode='wt') as out_stream:
                for seg in segments:
                    print(*[row[token] for row in seg], file=out_stream)
        except IOError as e:
            L.error('I/O error while extracting %s to %s: %s', otpl_file, text_file, str(e))
            errors += 1

    return errors