def regenerate_dat(input_tuple, stage):
  """Regenerates the original .dat text from a conformer and compares them.

  Args:
    input_tuple: tuple of (original .dat contents string,
      dataset_pb2.Conformer)
    stage: string, either 'stage1' or 'stage2'

  Returns:
    Tuple of (original dat, conformer, regenerated dat, match flag), where
    the match flag is 1 when the formats match and 0 on a mismatch.
  """
  original_dat, conformer = input_tuple
  writer = smu_writer_lib.SmuWriter(annotate=False)
  regenerate_fn = (
      writer.process_stage1_proto
      if stage == 'stage1' else writer.process_stage2_proto)
  regen_dat = regenerate_fn(conformer)
  try:
    smu_writer_lib.check_dat_formats_match(original_dat.splitlines(),
                                           regen_dat.splitlines())
  except smu_writer_lib.DatFormatMismatchError:
    beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                                 stage + '_dat_format_mismatched').inc()
    return original_dat, conformer, regen_dat, 0
  beam.metrics.Metrics.counter(_METRICS_NAMESPACE,
                               stage + '_dat_format_matched').inc()
  return original_dat, conformer, regen_dat, 1
def test_roundtrip(self):
  """Round-trips every stage2 record from .dat to proto and back."""
  writer = smu_writer_lib.SmuWriter(annotate=False)
  for conformer, orig_contents in self.parser.process_stage2():
    regenerated = writer.process_stage2_proto(conformer)
    smu_writer_lib.check_dat_formats_match(orig_contents,
                                           regenerated.splitlines())
def test_simple(self):
  """Checks AtomicInputWriter output against the golden atomic input file."""
  dat_path = os.path.join(TESTDATA_PATH, MAIN_DAT_FILE)
  parser = smu_parser_lib.SmuParser(dat_path)
  conformer, _ = next(parser.process_stage2())
  writer = smu_writer_lib.AtomicInputWriter()
  golden = get_file_contents(os.path.join(TESTDATA_PATH, ATOMIC_INPUT))
  smu_writer_lib.check_dat_formats_match(golden,
                                         writer.process(conformer).splitlines())
def main(argv):
  """Compares regenerated atomic input files against golden copies.

  For every conformer in every file matching FLAGS.input_glob, regenerates
  the atomic input text, compares it with the expected file under
  FLAGS.atomic_input_dir, and (optionally) writes the regenerated text to
  FLAGS.output_dir. Prints a summary of mismatches at the end.

  Args:
    argv: command-line arguments; only the program name is allowed.

  Raises:
    app.UsageError: if extra command-line arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  atomic_writer = smu_writer_lib.AtomicInputWriter()
  file_count = 0
  conformer_count = 0
  mismatches = 0

  for filepath in gfile.glob(FLAGS.input_glob):
    logging.info('Processing file %s', filepath)
    file_count += 1
    smu_parser = smu_parser_lib.SmuParser(filepath)
    for conformer, _ in smu_parser.process_stage2():
      conformer_count += 1
      actual_contents = atomic_writer.process(conformer)
      expected_fn = atomic_writer.get_filename_for_atomic_input(conformer)
      expected_path = os.path.join(FLAGS.atomic_input_dir, expected_fn)
      # NOTE(review): readlines() keeps trailing newlines while splitlines()
      # strips them — presumably check_dat_formats_match normalizes; confirm.
      with gfile.GFile(expected_path) as expected_f:
        expected_contents = expected_f.readlines()
      try:
        smu_writer_lib.check_dat_formats_match(expected_contents,
                                               actual_contents.splitlines())
      except smu_writer_lib.DatFormatMismatchError as e:
        mismatches += 1
        print(e)
      if FLAGS.output_dir:
        out_path = os.path.join(FLAGS.output_dir, expected_fn)
        with gfile.GFile(out_path, 'w') as f:
          f.write(actual_contents)

  status_str = ('COMPLETE: Read %d files, %d conformers, %d mismatches\n' %
                (file_count, conformer_count, mismatches))
  logging.info(status_str)
  print(status_str)
def test_roundtrip_tweaked_bt(self):
  """Round-trips stage2 records after perturbing the bond topologies.

  The .dat format should only ever use the starting topology, so this adds
  some wrong bond topologies to make sure they are ignored by the writer.
  """
  writer = smu_writer_lib.SmuWriter(annotate=False)
  for molecule, orig_contents in self.parser.process_stage2():
    # Duplicate the first topology twice, then corrupt the first two
    # entries; the regenerated .dat output must be unaffected.
    molecule.bond_topologies.append(molecule.bond_topologies[0])
    molecule.bond_topologies.append(molecule.bond_topologies[0])
    molecule.bond_topologies[0].source = dataset_pb2.BondTopology.SOURCE_ITC
    molecule.bond_topologies[1].source = dataset_pb2.BondTopology.SOURCE_CSD
    for bt in molecule.bond_topologies[:2]:
      bt.bonds[0].bond_type = dataset_pb2.BondTopology.BOND_TRIPLE
      bt.bond_topology_id += 9999
    smu_writer_lib.check_dat_formats_match(
        orig_contents,
        writer.process_stage2_proto(molecule).splitlines())
def try_roundtrip(self, filename, stage):
  """Parses a test .dat file and round-trips every record through the writer.

  Args:
    filename: name of a file under TESTDATA_PATH
    stage: string, either 'stage1' or 'stage2'

  Raises:
    ValueError: if stage is not one of the two known stage names.
  """
  parser = smu_parser_lib.SmuParser(os.path.join(TESTDATA_PATH, filename))
  writer = smu_writer_lib.SmuWriter(annotate=False)
  # Dispatch table: stage name -> (parse generator, proto-to-dat function).
  stage_fns = {
      'stage1': (parser.process_stage1, writer.process_stage1_proto),
      'stage2': (parser.process_stage2, writer.process_stage2_proto),
  }
  if stage not in stage_fns:
    raise ValueError(stage)
  process_fn, writer_fn = stage_fns[stage]
  for maybe_conformer, orig_contents in process_fn():
    # The parser yields exceptions in place of conformers on parse errors.
    if isinstance(maybe_conformer, Exception):
      raise maybe_conformer
    self.assertGreater(maybe_conformer.bond_topologies[0].bond_topology_id, 0)
    smu_writer_lib.check_dat_formats_match(
        orig_contents,
        writer_fn(maybe_conformer).splitlines())
def main(argv):
  """Round-trips SMU .dat files through the parser/writer and buckets results.

  Reads every file matching FLAGS.input_glob, parses each conformer at
  FLAGS.stage, regenerates the .dat text, and classifies each conformer into
  an Outcome. For each outcome, the original and regenerated text are
  appended to a pair of files rooted at FLAGS.output_stem, and a summary is
  logged and printed at the end.

  Args:
    argv: command-line arguments; only the program name is allowed.

  Raises:
    app.UsageError: if extra command-line arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  smu_writer = smu_writer_lib.SmuWriter(annotate=False)

  # Maps each Outcome to a pair of (original, regenerated) file handles.
  # The filename component is derived from the enum member name, e.g.
  # Outcome.PARSE_ERROR_KNOWN -> '<stem>_parse_error_known_original.dat'.
  # (This replaces four copy-pasted, near-identical GFile pairs.)
  output_files = {
      outcome:
      (gfile.GFile(
          '%s_%s_original.dat' % (FLAGS.output_stem, outcome.name.lower()),
          'w'),
       gfile.GFile(
           '%s_%s_regen.dat' % (FLAGS.output_stem, outcome.name.lower()),
           'w'))
      for outcome in (Outcome.SUCCESS, Outcome.MISMATCH,
                      Outcome.PARSE_ERROR_KNOWN, Outcome.PARSE_ERROR_UNKNOWN)
  }

  file_count = 0
  conformer_count = 0
  outcome_counts = collections.Counter()

  for filepath in gfile.glob(FLAGS.input_glob):
    logging.info('Processing file %s', filepath)
    file_count += 1
    smu_parser = smu_parser_lib.SmuParser(filepath)
    if FLAGS.stage == 'stage1':
      process_fn = smu_parser.process_stage1
    else:
      process_fn = smu_parser.process_stage2
    for conformer, orig_contents_list in process_fn():
      conformer_count += 1
      # The parser yields exceptions in place of conformers on parse errors.
      if isinstance(conformer, Exception):
        if isinstance(conformer, smu_parser_lib.SmuKnownError):
          outcome = Outcome.PARSE_ERROR_KNOWN
        else:
          outcome = Outcome.PARSE_ERROR_UNKNOWN
        regen_contents = '{}\n{}: {} {}\n'.format(
            smu_parser_lib.SEPARATOR_LINE, conformer.conformer_id,
            type(conformer).__name__, str(conformer))
      else:
        if FLAGS.stage == 'stage1':
          regen_contents = smu_writer.process_stage1_proto(conformer)
        else:
          regen_contents = smu_writer.process_stage2_proto(conformer)
        try:
          smu_writer_lib.check_dat_formats_match(orig_contents_list,
                                                 regen_contents.splitlines())
          outcome = Outcome.SUCCESS
        except smu_writer_lib.DatFormatMismatchError as e:
          outcome = Outcome.MISMATCH
          print(e)
      outcome_counts[outcome] += 1
      output_files[outcome][0].write('\n'.join(orig_contents_list) + '\n')
      output_files[outcome][1].write(regen_contents)

  for file_orig, file_regen in output_files.values():
    file_orig.close()
    file_regen.close()

  def outcome_status(outcome):
    # Guard against division by zero when no conformers were read.
    if conformer_count:
      percent = outcome_counts[outcome] / conformer_count * 100
    else:
      percent = float('nan')
    return '%5.1f%% %7d %s \n' % (percent, outcome_counts[outcome],
                                  str(outcome))

  status_str = ('COMPLETE: Read %d files, %d conformers\n' %
                (file_count, conformer_count) +
                outcome_status(Outcome.SUCCESS) +
                outcome_status(Outcome.PARSE_ERROR_KNOWN) +
                outcome_status(Outcome.MISMATCH) +
                outcome_status(Outcome.PARSE_ERROR_UNKNOWN))
  logging.info(status_str)
  print(status_str)