Пример #1
0
def regenerate_dat(input_tuple, stage):
    """Rebuilds the .dat text from a conformer and checks it against the original.

    Args:
      input_tuple: pair of (original .dat contents as a string,
        dataset_pb2.Conformer)
      stage: 'stage1' selects the stage1 writer; any other value selects the
        stage2 writer

    Returns:
      original_dat, conformer, regenerated dat, int (0=mismatch, 1=match)
    """
    original_dat, conformer = input_tuple
    writer = smu_writer_lib.SmuWriter(annotate=False)
    to_dat = (writer.process_stage1_proto
              if stage == 'stage1' else writer.process_stage2_proto)
    regen_dat = to_dat(conformer)

    try:
        smu_writer_lib.check_dat_formats_match(original_dat.splitlines(),
                                               regen_dat.splitlines())
    except smu_writer_lib.DatFormatMismatchError:
        matched = 0
        counter_name = stage + '_dat_format_mismatched'
    else:
        matched = 1
        counter_name = stage + '_dat_format_matched'

    # One counter per stage and outcome, incremented once per processed input.
    beam.metrics.Metrics.counter(_METRICS_NAMESPACE, counter_name).inc()
    return original_dat, conformer, regen_dat, matched
Пример #2
0
 def test_roundtrip(self):
     """Checks that stage2 .dat -> protocol buffer -> .dat matches the input."""
     writer = smu_writer_lib.SmuWriter(annotate=False)
     for conformer, original_lines in self.parser.process_stage2():
         regenerated = writer.process_stage2_proto(conformer)
         smu_writer_lib.check_dat_formats_match(original_lines,
                                                regenerated.splitlines())
Пример #3
0
    def __init__(self, output_path):
        """Creates DatOutputter.

        Args:
          output_path: file to write to; when falsy, output goes to sys.stdout
        """
        self.writer = smu_writer_lib.SmuWriter(annotate=False)
        # The handle is kept on the instance, so it is not opened in a `with`.
        self.outfile = open(output_path, 'w') if output_path else sys.stdout
Пример #4
0
 def test_roundtrip_tweaked_bt(self):
     """Tests the stage2 .dat round trip after adding perturbed bond topologies."""
     writer = smu_writer_lib.SmuWriter(annotate=False)
     for molecule, original_lines in self.parser.process_stage2():
         # Perturb the molecule's bond_topologies. The .dat format should only
         # ever use the starting topology, so some wrong bond topologies are
         # added to make sure they are ignored.
         topologies = molecule.bond_topologies
         for _ in range(2):
             topologies.append(topologies[0])
         topologies[0].source = dataset_pb2.BondTopology.SOURCE_ITC
         topologies[1].source = dataset_pb2.BondTopology.SOURCE_CSD
         for topology in topologies[0:2]:
             topology.bonds[0].bond_type = dataset_pb2.BondTopology.BOND_TRIPLE
             topology.bond_topology_id += 9999
         regenerated = writer.process_stage2_proto(molecule)
         smu_writer_lib.check_dat_formats_match(original_lines,
                                                regenerated.splitlines())
Пример #5
0
  def try_roundtrip(self, filename, stage):
    """Parses a test data file and checks each entry survives a round trip.

    Args:
      filename: name of the file under TESTDATA_PATH to parse
      stage: 'stage1' or 'stage2'; selects the parser and writer functions

    Raises:
      ValueError: if stage is neither 'stage1' nor 'stage2'
      Exception: any exception object yielded by the parser in place of a
        conformer is re-raised
    """
    parser = smu_parser_lib.SmuParser(os.path.join(TESTDATA_PATH, filename))
    writer = smu_writer_lib.SmuWriter(annotate=False)

    if stage not in ('stage1', 'stage2'):
      raise ValueError(stage)
    use_stage1 = stage == 'stage1'
    process_fn = parser.process_stage1 if use_stage1 else parser.process_stage2
    writer_fn = (writer.process_stage1_proto
                 if use_stage1 else writer.process_stage2_proto)

    for parsed, original_lines in process_fn():
      # The parser yields exception objects instead of raising them.
      if isinstance(parsed, Exception):
        raise parsed
      first_topology_id = parsed.bond_topologies[0].bond_topology_id
      self.assertGreater(first_topology_id, 0)
      regenerated = writer_fn(parsed)
      smu_writer_lib.check_dat_formats_match(original_lines,
                                             regenerated.splitlines())
Пример #6
0
    def test_pbtxt_to_annotated_dat(self, input_fn, expected_fn):
        """Compares annotated .dat output built from a pbtxt with a golden file.

        Args:
          input_fn: name of the text-format proto file under TESTDATA_PATH
          expected_fn: name of the expected (golden) output file under
            TESTDATA_PATH
        """
        # Note that this is partially a copy and paste from smu_writer (which
        # is what is used to regenerate the golden).
        input_path = os.path.join(TESTDATA_PATH, input_fn)
        golden_path = os.path.join(TESTDATA_PATH, expected_fn)

        conformers_proto = dataset_pb2.MultipleConformers()
        pbtxt = '\n'.join(get_file_contents(input_path))
        text_format.Parse(pbtxt, conformers_proto)

        writer = smu_writer_lib.SmuWriter(True)
        pieces = [
            writer.process_stage2_proto(conformer)
            for conformer in conformers_proto.conformers
        ]
        got = ''.join(pieces)

        golden_lines = get_file_contents(golden_path)

        # Printed so the golden file can be refreshed easily when this fails.
        regen_hint = ('Command line to regenerate:\npython3 parser/smu_writer.py '
                      '--input_file {} --output_file {} --annotate True')
        print(regen_hint.format(input_path, golden_path))

        self.assertEqual([line.rstrip('\n') for line in golden_lines],
                         got.splitlines())
def main(argv):
    """Round-trips SMU .dat files through protos and sorts results by outcome.

    Every file matching FLAGS.input_glob is parsed with SmuParser (stage1 when
    FLAGS.stage == 'stage1', otherwise stage2). Each parsed entry is written
    back out with SmuWriter and compared with the original text. The original
    and regenerated text are written to a pair of files per Outcome, named
    from FLAGS.output_stem, and a summary is logged and printed at the end.

    Args:
      argv: command-line arguments; more than one element is an error

    Raises:
      app.UsageError: if more than one command-line argument is given
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    smu_writer = smu_writer_lib.SmuWriter(annotate=False)

    # Filename tag for each Outcome, in the order the files are opened.
    outcome_tags = (
        (Outcome.SUCCESS, 'success'),
        (Outcome.MISMATCH, 'mismatch'),
        (Outcome.PARSE_ERROR_KNOWN, 'parse_error_known'),
        (Outcome.PARSE_ERROR_UNKNOWN, 'parse_error_unknown'),
    )

    # output_files maps from Outcome to a pair of file handles:
    # (original contents, regenerated contents).
    output_files = {}
    # Every handle opened so far, so the finally block can close all of them
    # even when opening a later file or processing the input fails.
    open_handles = []

    file_count = 0
    conformer_count = 0
    outcome_counts = collections.Counter()

    try:
        for outcome, tag in outcome_tags:
            file_orig = gfile.GFile(
                FLAGS.output_stem + '_' + tag + '_original.dat', 'w')
            open_handles.append(file_orig)
            file_regen = gfile.GFile(
                FLAGS.output_stem + '_' + tag + '_regen.dat', 'w')
            open_handles.append(file_regen)
            output_files[outcome] = (file_orig, file_regen)

        for filepath in gfile.glob(FLAGS.input_glob):
            logging.info('Processing file %s', filepath)
            file_count += 1
            smu_parser = smu_parser_lib.SmuParser(filepath)
            if FLAGS.stage == 'stage1':
                process_fn = smu_parser.process_stage1
            else:
                process_fn = smu_parser.process_stage2
            for conformer, orig_contents_list in process_fn():
                conformer_count += 1

                outcome = None

                # The parser yields exception objects in place of conformers
                # for entries it could not parse.
                if isinstance(conformer, Exception):
                    if isinstance(conformer, smu_parser_lib.SmuKnownError):
                        outcome = Outcome.PARSE_ERROR_KNOWN
                    else:
                        outcome = Outcome.PARSE_ERROR_UNKNOWN
                    # There is nothing to regenerate, so the error itself is
                    # written to the "regen" file.
                    regen_contents = '{}\n{}: {} {}\n'.format(
                        smu_parser_lib.SEPARATOR_LINE, conformer.conformer_id,
                        type(conformer).__name__, str(conformer))
                else:
                    if FLAGS.stage == 'stage1':
                        regen_contents = smu_writer.process_stage1_proto(
                            conformer)
                    else:
                        regen_contents = smu_writer.process_stage2_proto(
                            conformer)
                    try:
                        smu_writer_lib.check_dat_formats_match(
                            orig_contents_list, regen_contents.splitlines())
                        outcome = Outcome.SUCCESS
                    except smu_writer_lib.DatFormatMismatchError as e:
                        outcome = Outcome.MISMATCH
                        print(e)

                outcome_counts[outcome] += 1
                output_files[outcome][0].write(
                    '\n'.join(orig_contents_list) + '\n')
                output_files[outcome][1].write(regen_contents)
    finally:
        # Close on both the success and the failure path so no handle is
        # left open.
        for handle in open_handles:
            handle.close()

    def outcome_status(outcome):
        """Formats one summary line: percentage, count and outcome name."""
        if conformer_count:
            percent = outcome_counts[outcome] / conformer_count * 100
        else:
            # Nothing was read, so a percentage is undefined.
            percent = float('nan')
        return '%5.1f%% %7d %s \n' % (percent, outcome_counts[outcome],
                                      str(outcome))

    status_str = ('COMPLETE: Read %d files, %d conformers\n' %
                  (file_count, conformer_count) +
                  outcome_status(Outcome.SUCCESS) +
                  outcome_status(Outcome.PARSE_ERROR_KNOWN) +
                  outcome_status(Outcome.MISMATCH) +
                  outcome_status(Outcome.PARSE_ERROR_UNKNOWN))

    logging.info(status_str)
    print(status_str)