Exemplo n.º 1
0
def process():
    """Copy an SD-file, leaving out molecules found in an exclusion SD-file.

    Command line (taken from ``sys.argv``)::

        [input.sdf] [exclude-molecules.sdf] [output.sdf]

    Every molecule of the exclusion file is passed through
    ``setupMolecule`` and its ``Chem.calcHashCode`` value is remembered.
    The input file is then streamed: a molecule whose hash code is among
    the remembered ones is reported on stderr and skipped, any other
    molecule is written to the output file.  A progress line is printed
    every 10000 molecules and a summary at the end, both on stderr.

    Raises:
        SystemExit: With status 2 when fewer than three file arguments
            are supplied.
    """
    args = sys.argv

    if len(args) < 4:
        print('Usage:',
              args[0],
              '[input.sdf] [exclude-molecules.sdf] [output.sdf]',
              file=sys.stderr)
        sys.exit(2)

    in_stream = Base.FileIOStream(args[1], 'r')
    excl_stream = Base.FileIOStream(args[2], 'r')
    out_stream = Base.FileIOStream(args[3], 'w')

    reader = Chem.SDFMoleculeReader(in_stream)
    excl_reader = Chem.SDFMoleculeReader(excl_stream)
    writer = Chem.SDFMolecularGraphWriter(out_stream)

    # A single molecule object serves as the read buffer for both files.
    mol = Chem.BasicMolecule()

    # Multi-conformer handling is switched off on both readers and the writer.
    for sdf_reader in (reader, excl_reader):
        Chem.setMultiConfImportParameter(sdf_reader, False)

    Chem.setMultiConfExportParameter(writer, False)

    stats = Stats()
    stats.read = 0
    stats.dropped = 0

    # Pass 1: collect the hash codes of all molecules to exclude.
    excluded_hashes = set()

    while excl_reader.read(mol):
        setupMolecule(mol)
        excluded_hashes.add(Chem.calcHashCode(mol))

    # Pass 2: forward only the input molecules with an unknown hash code.
    while reader.read(mol):
        setupMolecule(mol)

        if Chem.calcHashCode(mol) not in excluded_hashes:
            writer.write(mol)
        else:
            stats.dropped += 1
            # stats.read has not been incremented yet, so the reported
            # number is the zero-based index of the molecule in the input.
            message = ('Dropped Molecule ' + str(stats.read) + ': ' +
                       Chem.generateSMILES(mol) + ' ' + Chem.getName(mol))
            print(message, file=sys.stderr)

        stats.read += 1

        if stats.read % 10000 == 0:
            print('Processed ' + str(stats.read) + ' Molecules...',
                  file=sys.stderr)

    summary_lines = (
        '',
        '-- Summary --',
        'Molecules processed: ' + str(stats.read),
        'Molecules dropped: ' + str(stats.dropped),
    )

    for line in summary_lines:
        print(line, file=sys.stderr)
Exemplo n.º 2
0
def cleanStructures():
    """Run ``processMolecule`` over an SD-file and split the results.

    Command line (taken from ``sys.argv``)::

        [input.sdf] [output.sdf] [dropped.sdf] [start_index] [[count]]

    Reading starts at record ``start_index`` (the reader is repositioned
    with ``setRecordIndex`` when the index is greater than zero).  Each
    molecule is handed to ``processMolecule``; a truthy result is written
    to the output file, otherwise the unmodified input molecule is written
    to the dropped file and reported on stderr.  When ``count`` is given
    and greater than zero, processing stops after that many molecules.
    Progress lines and a final summary are printed to stderr.

    The numeric arguments are parsed before any stream is created, so an
    invalid value is rejected before the output files are opened for
    writing.

    Raises:
        SystemExit: With status 2 when too few arguments are supplied or
            when ``start_index``/``count`` is not an integer.
    """
    usage_args = '[input.sdf] [output.sdf] [dropped.sdf] [start_index] [[count]]'

    if len(sys.argv) < 5:
        print('Usage:',
              sys.argv[0],
              usage_args,
              file=sys.stderr)
        sys.exit(2)

    # Validate the numeric arguments first: failing here must not happen
    # after the output streams below have already been opened in 'w' mode.
    try:
        offset = int(sys.argv[4])
        count = int(sys.argv[5]) if len(sys.argv) > 5 else 0
    except ValueError:
        print('Error: [start_index] and [count] must be integers',
              file=sys.stderr)
        print('Usage:',
              sys.argv[0],
              usage_args,
              file=sys.stderr)
        sys.exit(2)

    ifs = Base.FileIOStream(sys.argv[1], 'r')
    ofs = Base.FileIOStream(sys.argv[2], 'w')
    dofs = Base.FileIOStream(sys.argv[3], 'w')

    reader = Chem.SDFMoleculeReader(ifs)
    writer = Chem.SDFMolecularGraphWriter(ofs)
    dwriter = Chem.SDFMolecularGraphWriter(dofs)
    mol = Chem.BasicMolecule()

    # stats is handed to processMolecule on every call; 'modified' is only
    # initialised and printed here, so it is presumably updated there
    # (TODO confirm against processMolecule).
    stats = Stats()
    stats.read = 0
    stats.dropped = 0
    stats.modified = 0

    Chem.setMultiConfImportParameter(reader, False)
    Chem.setMultiConfExportParameter(writer, False)
    Chem.setMultiConfExportParameter(dwriter, False)

    if offset > 0:
        print('Skipping Molecules to Start Index ' + str(offset),
              file=sys.stderr)
        reader.setRecordIndex(offset)

    # stats.read tracks the absolute record index; the number of molecules
    # handled in this run is always stats.read - offset.
    stats.read = offset

    while reader.read(mol):
        proc_mol = processMolecule(mol, stats)

        if proc_mol:
            writer.write(proc_mol)
        else:
            stats.dropped += 1
            dwriter.write(mol)
            # NOTE(review): generateSMILES is called unqualified here while
            # getName comes from Chem - presumably a helper defined
            # elsewhere in this file; confirm.
            print('Dropped Molecule ' + str(stats.read) + ': ' +
                  generateSMILES(mol) + ' ' + Chem.getName(mol),
                  file=sys.stderr)

        stats.read += 1

        # The progress line is triggered by the absolute record index but
        # reports the number of molecules processed in this run.
        if stats.read % 10000 == 0:
            print('Processed ' + str(stats.read - offset) + ' Molecules...',
                  file=sys.stderr)

        # count == 0 means "no limit".
        if count > 0 and (stats.read - offset) >= count:
            break

    print('', file=sys.stderr)
    print('-- Summary --', file=sys.stderr)
    print('Molecules processed: ' + str(stats.read - offset), file=sys.stderr)
    print('Molecules dropped: ' + str(stats.dropped), file=sys.stderr)
    print('Molecules modified: ' + str(stats.modified), file=sys.stderr)