# get all the records from the SCOP classification file
# this now has (pdbid, chain) tuples
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('hierarchy: ' + str(hierarchy))

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# end the program if the output dir doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py and ' + \
                    'generate-hmm.py first')
    sys.exit(2)

# iterate over each hierarchy level to query
logger.debug('Iterating over query families ' + str(hierarchy))
used_pdbids = set()
# FIXME pdbids must be replaced with a set of (pdbid, chain) tuples
for sunid, pdbid_tuples in hierarchy.iteritems():
    # create a directory for the positive control queries, if it doesn't exist
    logger.debug('Checking whether directory for positive controls exists...')
    aligner_output_dir = os.path.join(output_dir, str(sunid), aligner)
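# hierarchy_sets_from_file is not shown in this excerpt. Below is a minimal
# sketch of one plausible implementation, assuming the standard SCOP dir.cla
# format, e.g.:
#   d1ux8a_ 1ux8 A: a.1.1.1 113449 cl=46456,cf=46457,sf=46458,fa=46459,...
# The signature and return shape (a dict mapping each sunid at target_level
# to a set of (pdbid, chain) tuples) are inferred from how the result is used
# above; this is an illustration, not the project's actual code.
def _hierarchy_sets_from_file_sketch(filename, target_level, target_sunid):
    hierarchy = {}
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            sid, pdbid, chains, sccs, px, levels = line.split()
            # parse 'cl=46456,cf=46457,...' into {'cl': 46456, ...}
            sunids = dict((key, int(value)) for key, value in
                          (pair.split('=') for pair in levels.split(',')))
            # keep only domains that fall under the requested sunid
            # (how the real helper scopes records is an assumption)
            if target_sunid not in sunids.values():
                continue
            # 'A:' means chain A; multi-region chain fields are more complex
            # and are ignored in this sketch
            chain = chains.split(':')[0]
            hierarchy.setdefault(sunids[target_level], set()).add(
                (pdbid, chain))
    return hierarchy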
logger.debug('  number of trained chains: ' + str(len(trained_pdbids)))
query_pdbids -= trained_pdbids
logger.debug('  number of query chains: ' + str(len(query_pdbids)))

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# end the program if the output dir doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py, ' + \
                    'generate-hmm.py, and generate-positive-controls.py ' + \
                    'first')
    sys.exit(2)

# determine which hierarchy levels (sunids) were left out during training,
# based on which subdirectories of the output directory exist
logger.debug('Determining which hierarchy levels were left out during '
             'training...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
sunids = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                os.listdir(output_dir))
logger.debug('  sunids: ' + str(sunids))

# iterate over each family
# do NOT iterate over each family for now, just the first
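# A toy illustration (hypothetical chain IDs) of the split performed above:
# any chain that appeared during training is removed from the query set, so
# every remaining query chain is genuinely unseen.
_trained = set([('1ux8', 'A'), ('2abc', 'B')])
_queries = set([('1ux8', 'A'), ('3xyz', 'A')])
_queries -= _trained
assert _queries == set([('3xyz', 'A')])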
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# get all the records from the SCOP classification file
logger.debug('Getting PDB IDs from SCOP Classification file...')
all_pdbs = all_pdbids_from_file_in(SCOP_CLASSIFICATION_FILE, target_level,
                                   target_sunid)
logger.debug('all_pdbs: ' + str(all_pdbs))
if len(all_pdbs) == 0:
    logger.critical('Nothing in all_pdbs for target level: ' +
                    str(target_level) + ' and target_sunid: ' +
                    str(target_sunid))
    sys.exit(1)

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# create the output directory if it doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug('...it doesn\'t so we create it')
    os.mkdir(output_dir)

# write a README file to the output directory
logger.debug('Writing a top-level README...')
write_general_readme(os.path.join(output_dir, 'README'), config)
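# nrpdbs_from_file is not shown in this excerpt. A minimal sketch of one
# plausible implementation, assuming the NCBI NRPDB flat file: one
# whitespace-separated record per chain, beginning with the PDB ID and chain
# ID, followed by group/representative columns for several redundancy
# thresholds, with representative_field taken to be the index of the 0/1
# flag marking a group's representative. The column layout and the returned
# mapping (pdbid -> chain) are assumptions for illustration only.
def _nrpdbs_from_file_sketch(filename, representative_field):
    nrpdbs = {}
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.split()
            pdbid, chain = fields[0].lower(), fields[1]
            # keep only the representative chain of each redundancy group
            if fields[representative_field] == '1':
                nrpdbs[pdbid] = chain
    return nrpdbs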
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# get all the records from the SCOP classification file
logger.debug('Getting PDB IDs from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('hierarchy: ' + str(hierarchy))
if len(hierarchy) == 0:
    logger.critical('Nothing in hierarchy for target level: ' +
                    str(target_level) + ' and target_sunid: ' +
                    str(target_sunid))
    sys.exit(1)

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# create the output directory if it doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug('...it doesn\'t so we create it')
    os.mkdir(output_dir)

# write a README file to the output directory
logger.debug('Writing a top-level README...')
write_general_readme(os.path.join(output_dir, 'README'), config)
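# write_general_readme is also not shown here. A minimal sketch, assuming it
# simply records the runtime configuration so a run's parameters can be
# recovered later; the real helper may write more than this.
def _write_general_readme_sketch(path, config):
    with open(path, 'w') as readme:
        readme.write('Run configuration:\n')
        for key in sorted(config):
            readme.write('  ' + key + ': ' + str(config[key]) + '\n')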
# get all the records from the SCOP classification file
# this now has (pdbid, chain) tuples
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('hierarchy: ' + str(hierarchy))

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# end the program if the output dir doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-training-targets.py first')
    sys.exit(2)

# iterate over each hierarchy level to query
logger.debug('Iterating over query families ' + str(hierarchy))
used_pdbids = set()
pdb_names = {}
# FIXME pdbids must be replaced with a set of (pdbid, chain) tuples
for sunid, pdbid_tuples in hierarchy.iteritems():
    # create a directory for the positive control queries, if it doesn't exist
    logger.debug('Checking whether directory for positive controls exists...')
    level_output_dir = os.path.join(output_dir, str(sunid))
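# For orientation, the directory layout implied by these scripts (the sunid
# values are hypothetical): each SCOP sunid gets its own subdirectory of
# output_dir, the positive-control script above nests a per-aligner
# directory (aligner_output_dir) inside it, and the training scripts write a
# top-level README.
#
#   output_dir/
#       README
#       46459/          # one directory per hierarchy-level sunid
#           matt/       # per-aligner subdirectory, e.g. for Matt alignments
#       46460/
#           ...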
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))

# check if the output dir exists
if not os.path.isdir(output_dir):
    logger.critical('Directory ' + output_dir + ' does not exist.')
    sys.exit(STATUS_NO_DIR)

# find all subdirectories of the output directory
logger.debug('Determining which family directories exist...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
subdirectories = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                        os.listdir(output_dir))
logger.debug('  subdirectories: ' + str(subdirectories))

# keep only the subdirectories whose names are entirely digits (the family
# sunids); str.isdigit does this directly, unlike the previous
# all(filter(...)) test, which wrongly accepted names with no digits at all
families = filter(lambda x: x.isdigit(), subdirectories)
logger.debug('  families: ' + str(families))

# touch the CSV file so that it is empty
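# A short demonstration of the bug fixed above: filter() keeps only the
# characters that are digits, so for a name containing no digits it returns
# an empty sequence, and all() of an empty sequence is True, meaning the old
# test wrongly accepted non-numeric directory names.
assert all(filter(lambda y: y.isdigit(), 'abc'))   # old test: wrongly True
assert not 'abc'.isdigit()                         # str.isdigit: correctly False
assert '46459'.isdigit()                           # family sunids still pass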