# Read the SCOP classification file for the requested level and sunid.
# The result now carries (pdbid, chain) tuples.
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(
    SCOP_CLASSIFICATION_FILE, target_level, target_sunid)
logger.debug('hierarchy: %s' % (hierarchy,))

# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Abort (exit status 2) when the output directory is missing; the scripts
# named in the message below are expected to have been run beforehand.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory %s does not yet exist' % output_dir)
    logger.critical('Please run generate-matt-alignments.py and '
                    'generate-hmm.py first')
    sys.exit(2)

# Walk over every hierarchy level that should be queried.
logger.debug('Iterating over query families %s' % (hierarchy,))

used_pdbids = set()

# FIXME pdbids must be replaced with a set of (pdbid,chain) tuples
for sunid, pdbid_tuples in hierarchy.iteritems():

    # path of the per-sunid, per-aligner directory used for the positive
    # control queries (only the path is built in the code visible here)
    logger.debug('Checking whether directory for positive controls exists...')
    aligner_output_dir = os.path.join(output_dir, str(sunid), aligner)
                      
# Report how many chains were trained on, then remove them in place from the
# query chains so that only the remaining chains are reported as queries.
logger.debug('  number of trained chains: %d' % len(trained_pdbids))
query_pdbids -= trained_pdbids

logger.debug('  number of query chains: %d' % len(query_pdbids))



# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Abort (exit status 2) when the output directory is missing; the scripts
# named in the message below are expected to have been run beforehand.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory %s does not yet exist' % output_dir)
    logger.critical('Please run generate-matt-alignments.py, '
                    'generate-hmm.py, and generate-positive-controls.py '
                    'first')
    sys.exit(2)

# Every entry of output_dir that is itself a directory is taken to be one
# hierarchy level (sunid).
logger.debug('Determining which hierarchy levels were left out during '
             'training...')
logger.debug('  output_dir contains: %s' % (os.listdir(output_dir),))
sunids = [_entry for _entry in os.listdir(output_dir)
          if os.path.isdir(os.path.join(output_dir, _entry))]
logger.debug('  sunids: %s' % (sunids,))

# iterate over each family
# do NOT iterate over each family for now, just the first
                      
# Report how many chains were trained on, then remove them in place from the
# query chains so that only the remaining chains are reported as queries.
logger.debug('  number of trained chains: %d' % len(trained_pdbids))
query_pdbids -= trained_pdbids

logger.debug('  number of query chains: %d' % len(query_pdbids))



# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Abort (exit status 2) when the output directory is missing; the scripts
# named in the message below are expected to have been run beforehand.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory %s does not yet exist' % output_dir)
    logger.critical('Please run generate-matt-alignments.py, '
                    'generate-hmm.py, and generate-positive-controls.py '
                    'first')
    sys.exit(2)

# Every entry of output_dir that is itself a directory is taken to be one
# hierarchy level (sunid).
logger.debug('Determining which hierarchy levels were left out during '
             'training...')
logger.debug('  output_dir contains: %s' % (os.listdir(output_dir),))
sunids = [_entry for _entry in os.listdir(output_dir)
          if os.path.isdir(os.path.join(output_dir, _entry))]
logger.debug('  sunids: %s' % (sunids,))

# iterate over each family
# do NOT iterate over each family for now, just the first
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: %s' % (config,))

# Only the non-redundant PDB chains listed in the NRPDB file are used for
# training.
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# Collect the PDB IDs found in the SCOP classification file for the target
# level/sunid, and stop with exit status 1 if there are none.
logger.debug('Getting PDB IDs from SCOP Classification file...')
all_pdbs = all_pdbids_from_file_in(
    SCOP_CLASSIFICATION_FILE, target_level, target_sunid)
logger.debug('all_pdbs: %s' % (all_pdbs,))
if len(all_pdbs) == 0:
    logger.critical('Nothing in all_pdbs for target level: %s'
                    ' and target_sunid: %s' % (target_level, target_sunid))
    sys.exit(1)

# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Make sure the output directory is present, creating it when necessary.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug("...it doesn't so we create it")
    os.mkdir(output_dir)

# Drop a README describing this run's configuration into the output directory.
logger.debug('Writing a top-level README...')
write_general_readme(os.path.join(output_dir, 'README'), config)
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: %s' % (config,))

# Only the non-redundant PDB chains listed in the NRPDB file are used for
# training.
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# Load the hierarchy from the SCOP classification file for the target
# level/sunid, and stop with exit status 1 if it is empty.
logger.debug('Getting PDB IDs from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(
    SCOP_CLASSIFICATION_FILE, target_level, target_sunid)
logger.debug('hierarchy: %s' % (hierarchy,))
if len(hierarchy) == 0:
    logger.critical('Nothing in hierarchy for target level: %s'
                    ' and target_sunid: %s' % (target_level, target_sunid))
    sys.exit(1)

# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Make sure the output directory is present, creating it when necessary.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug("...it doesn't so we create it")
    os.mkdir(output_dir)

# Drop a README describing this run's configuration into the output directory.
logger.debug('Writing a top-level README...')
write_general_readme(os.path.join(output_dir, 'README'), config)
# Read the SCOP classification file for the requested level and sunid.
# The result now carries (pdbid, chain) tuples.
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(
    SCOP_CLASSIFICATION_FILE, target_level, target_sunid)
logger.debug('hierarchy: %s' % (hierarchy,))

# The explicit whitelist of PDB chains to train on is currently disabled:
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Abort (exit status 2) when the output directory is missing; the script
# named in the message below is expected to have been run beforehand.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory %s does not yet exist' % output_dir)
    logger.critical('Please run generate-training-targets.py first')
    sys.exit(2)

# Walk over every hierarchy level that should be queried.
logger.debug('Iterating over query families %s' % (hierarchy,))

used_pdbids = set()
pdb_names = {}

# FIXME pdbids must be replaced with a set of (pdbid,chain) tuples
for sunid, pdbid_tuples in hierarchy.iteritems():

    # path of the per-sunid directory used for the positive control queries
    # (only the path is built in the code visible here)
    logger.debug('Checking whether directory for positive controls exists...')
    level_output_dir = os.path.join(output_dir, str(sunid))
# Example #7
# 0
# Pull the settings this script needs out of the parsed command line.
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)
# determine the amount of logging info to output: with the verbose flag set,
# the console handler is switched to the DEBUG level
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))

# check if the output dir exists; exit with STATUS_NO_DIR when it does not
if not os.path.isdir(output_dir):
    logger.critical('Directory ' + output_dir + ' does not exist.')
    sys.exit(STATUS_NO_DIR)

# find all subdirectories of the output directory
logger.debug('Determining which family directories exist...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
# built as a real list so it can be both logged and iterated again below
subdirectories = [_name for _name in os.listdir(output_dir)
                  if os.path.isdir(os.path.join(output_dir, _name))]
logger.debug('  subdirectories: ' + str(subdirectories))
# keep only the subdirectories whose names consist solely of digits
# NOTE: the previous test, all(filter(lambda y: y.isdigit(), x)), was always
# true (filtering leaves only digit characters, each of which is truthy, and
# all() of nothing is true as well), so every subdirectory was accepted.
# str.isdigit() on the whole name performs the intended check.
families = [_name for _name in subdirectories if _name.isdigit()]
logger.debug('  families: '  + str(families))

# touch the CSV file so that it is empty