def bond_topology_summaries_from_csv(filename):
  """Generates a bare BondTopologySummary for every bond topology in a CSV.

  Each topology produced by
  smu_utils_lib.generate_bond_topologies_from_csv is copied into a freshly
  constructed summary; no other summary field is set here.

  Args:
    filename: csv file of bond topologies to read

  Yields:
    (bond_topology_id, dataset_pb2.BondTopologySummary) tuples, one per
    bond topology read from the file.
  """
  topologies = smu_utils_lib.generate_bond_topologies_from_csv(filename)
  for topology in topologies:
    bare_summary = dataset_pb2.BondTopologySummary()
    # Only the topology itself is filled in; the count fields are
    # deliberately left untouched (i.e. 0).
    bare_summary.bond_topology.CopyFrom(topology)
    yield topology.bond_topology_id, bare_summary
def pipeline(root):
  """Builds the Beam pipeline that checks SMILES permutation invariance.

  Bond topologies are generated from FLAGS.input_bond_topology_csv, passed
  through check_smiles_permutation_invariance, formatted as comma separated
  lines and written to FLAGS.output_csv as a single shard with the header
  'bt_id,smiles0,smiles1'.

  Args:
    root: the root of the pipeline.
  """
  topologies = smu_utils_lib.generate_bond_topologies_from_csv(
      FLAGS.input_bond_topology_csv)

  created = root | 'CreateTopologies' >> beam.Create(topologies)
  shuffled = created | 'Reshuffle1' >> beam.Reshuffle()

  checked = shuffled | 'CheckInvariance' >> beam.FlatMap(
      check_smiles_permutation_invariance)
  reshuffled = checked | 'Reshuffle2' >> beam.Reshuffle()

  # Every element is an iterable of values; stringify each value and join
  # them into one CSV line.
  csv_lines = reshuffled | 'CSVFormat' >> beam.Map(
      lambda vals: ','.join(map(str, vals)))

  _ = csv_lines | 'WriteOutput' >> beam.io.WriteToText(
      FLAGS.output_csv, header='bt_id,smiles0,smiles1', num_shards=1)
def test_basic(self):
  """Checks that two CSV rows are parsed into the expected bond topologies.

  Writes a header plus two rows to a temporary CSV file, reads it back with
  smu_utils_lib.generate_bond_topologies_from_csv and verifies the id, the
  number of atoms and the SMILES string of each generated topology.
  """
  import os  # Local import: only needed to remove the temporary file.

  # The context manager guarantees the handle is closed even if a write
  # fails. delete=False keeps the file on disk after closing so that it can
  # be re-opened by name below.
  with tempfile.NamedTemporaryFile(mode='w', delete=False) as infile:
    # Registered immediately so the file is removed when the test finishes,
    # whether it passes or fails, instead of being leaked.
    self.addCleanup(os.remove, infile.name)
    infile.write(
        'id,num_atoms,atoms_str,connectivity_matrix,hydrogens,smiles\n')
    infile.write('68,3,C N+O-,310,010,[NH+]#C[O-]\n')
    infile.write('134,4,N+O-F F ,111000,1000,[O-][NH+](F)F\n')

  out = smu_utils_lib.generate_bond_topologies_from_csv(infile.name)

  bt = next(out)
  self.assertEqual(68, bt.bond_topology_id)
  self.assertLen(bt.atoms, 4)
  self.assertEqual(bt.smiles, '[NH+]#C[O-]')

  bt = next(out)
  self.assertEqual(134, bt.bond_topology_id)
  self.assertLen(bt.atoms, 5)
  self.assertEqual(bt.smiles, '[O-][NH+](F)F')