def parse_hmmer_output(filename):
    """Parse an HMMER result file into a (pdbid, chainid, rawscore) triple.

    Returns ('Unknown', 'Unknown', None) if no result could be parsed
    from the file.
    """
    logger.debug('parsing from file: ' + filename)
    result = result_from_file(filename)
    # compare against the None singleton with 'is', not '==' (PEP 8)
    if result is None:
        return 'Unknown', 'Unknown', None
    pdbid, chainid = result.pdbid, result.chain
    # coerce the full-sequence score to a float for numeric use downstream
    rawscore = float(result.full_seq_score.score)
    return pdbid, chainid, rawscore
def parse_smurf_output(filename):
    """Parse a SMURF result file into a (pdbid, chainid, rawscore) triple.

    Returns (None, None, None) when the header line is too short to hold
    a 'pdbid_chain' identifier, and (pdbid, chainid, None) when the file
    reports no score.
    """
    logger.debug('parsing from file: ' + filename)
    with open(filename, 'r') as f:
        # header looks like '>XXXX_C ...': skip the leading marker and take
        # the six characters holding the PDB id, '_', and the chain id
        pdb_str = f.readline()[1:7]
        if len(pdb_str) < 6:
            return None, None, None
        pdbid, chainid = pdb_str.split('_')
        read_value = f.readline()
        # a second line beginning with 'Sequence' means no score was emitted
        # (startswith is the idiomatic form of the old slice comparison)
        if read_value.startswith('Sequence'):
            return pdbid, chainid, None
        # the numeric score follows an 11-character prefix on the score line
        rawscore = float(read_value[11:])
        return pdbid, chainid, rawscore
# determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # configuration summary config = {'target_level' : target_level, 'target_sunid' : target_sunid, 'output_dir' : output_dir, #'logging_level' : level, 'aligner' : aligner, 'representative_field' : representative_field} logger.debug('Program configuration: ' + str(config)) # create a dictionary from PDB IDs to residue sequences logger.debug('Building a mapping from PDBID to residue sequence...') sequences = sequences_from_file(FASTA_FILENAME) # get the set of non-redundant PDB chains from the NRPDB file, and use only # these chains for training logger.debug('Getting non-redundant set of PDB chains...') nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field) logger.debug('There are ' + str(len(nrpdbs)) + ' non-redundant chains.') # then, filter the query_pdbids and the trained_pdbids to be only reps # then subtract out the trained_pdbids # get all the PDB IDs on which to test (don't test already trained superfamily)
representative_field = parsed_args.repfield # determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # configuration summary config = {'target_level' : target_level, 'target_sunid' : target_sunid, 'output_dir' : output_dir, #'logging_level' : level, 'aligner' : aligner, 'representative_field' : representative_field} logger.debug('Program configuration: ' + str(config)) # create a dictionary from PDB IDs to residue sequences logger.debug('Building a mapping from PDBID to residue sequence...') sequences = sequences_from_file(FASTA_FILENAME) # get the set of non-redundant PDB chains from the NRPDB file logger.debug('Getting non-redundant set of PDB chains...') nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field) # logger.debug('nrpdbs: ' + str(nrpdbs)) # get all the records from the SCOP classification file # this now has (pdbid,chain) tuples
# basename for the negative-control output produced by this script
output_filename = 'negative_controls'

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary, logged so a run's parameters can be reconstructed
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          #'logging_level' : level,
          'representative_field' : representative_field}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
logger.debug('There are ' + str(len(nrpdbs)) + ' non-redundant chains.')

# then, filter the query_pdbids and the trained_pdbids to be only reps
# then subtract out the trained_pdbids
# get all the PDB IDs on which to test (don't test already trained superfamily)
logger.debug('Getting records to test from SCOP Classification file...')
original_query_pdbids = all_pdbids_from_file(SCOP_CLASSIFICATION_FILE)
# map each query id to its non-redundant representative, dropping ids with
# no representative; a set comprehension replaces the old set([...]) form
query_pdbids = {nrpdbs[x] for x in original_query_pdbids if x in nrpdbs}
output_dir = parsed_args.outputdir.rstrip('/') # remove trailing slash query_file = parsed_args.query_file # determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # configuration summary config = {'output_dir' : output_dir, #'logging_level' : level, 'aligner' : aligner, 'query_file': query_file} logger.debug('Program configuration: ' + str(config)) # end the program if the output dir doesn't exist logger.debug('Checking whether output directory exists...') if not os.path.isdir(output_dir): logger.critical('Output directory ' + output_dir + ' does not yet exist') logger.critical('Please run generate-matt-alignments.py and ' + \ 'generate-hmm.py first') sys.exit(2) # TODO get base fasta filename, put this in a loop over all queries base_fasta_filename = os.path.basename(query_file)
pdb_dir = parsed_args.pdbdir.rstrip('/') # remove trailing slash representative_field = parsed_args.repfield # determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # summary of the program configuration config = {'output_dir' : output_dir, 'pdb_dir' : pdb_dir, 'representative_field' : representative_field, 'target_level' : target_level, 'target_sunid' : target_sunid} logger.debug('Program configuration: ' + str(config)) # get the set of non-redundant PDB chains from the NRPDB file, and use only # these chains for training nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field) # get all the records from the SCOP classification file logger.debug('Getting PDB IDs from SCOP Classification file...') all_pdbs = all_pdbids_from_file_in(SCOP_CLASSIFICATION_FILE, target_level, target_sunid) logger.debug('all_pdbs: ' + str(all_pdbs)) if len(all_pdbs) == 0: logger.critical('Nothing in all_pdbs for target level: ' + str(target_level) + ' and target_sunid: ' + str(target_sunid)) sys.exit(1)
pdb_dir = parsed_args.pdbdir.rstrip('/') # remove trailing slash representative_field = parsed_args.repfield # determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # summary of the program configuration config = {'output_dir' : output_dir, 'pdb_dir' : pdb_dir, 'representative_field' : representative_field, 'target_level' : target_level, 'target_sunid' : target_sunid} logger.debug('Program configuration: ' + str(config)) # get the set of non-redundant PDB chains from the NRPDB file, and use only # these chains for training nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field) # get all the records from the SCOP classification file logger.debug('Getting PDB IDs from SCOP Classification file...') hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level, target_sunid) logger.debug('hierarchy: ' + str(hierarchy)) if len(hierarchy) == 0: logger.critical('Nothing in hierarchy for target level: ' + str(target_level) + ' and target_sunid: ' + str(target_sunid)) sys.exit(1)
output_dir = parsed_args.outputdir.rstrip('/') # remove trailing slash aligner = parsed_args.aligner smurf_lite_threshold = parsed_args.smurf_lite_threshold simev_frequency = parsed_args.simev_frequency simev_count = parsed_args.simev_count simev_threshold = parsed_args.simev_threshold # determine the amount of logging info to output if parsed_args.verbose: from logging import DEBUG from gargamel.logger import console_handler console_handler.setLevel(DEBUG) # summary of the program configuration config = {'output_dir' : output_dir, 'aligner' : aligner} logger.debug('Program configuration: ' + str(config)) # end the program if the output dir doesn't exist logger.debug('Checking whether output directory exists...') if not os.path.isdir(output_dir): logger.critical('Output directory ' + output_dir + ' does not yet exist') logger.critical('Please run generate-matt-alignments.py first') sys.exit(2) # determine which executable and multiple alignment file to use for the # hmmbuild step, and determine the name of the HMM file if aligner == SMURF_LITE: executable = SMURF_LITE_HMMBUILD_EXECUTABLE preparse_executable = SMURF_LITE_PREPARSE_EXECUTABLE hmm_filename = os.path.basename(output_dir) + '_smurf-lite.hmm+'
output_dir = parsed_args.outputdir.rstrip('/') # remove trailing slash
aligner = parsed_args.aligner
smurf_lite_threshold = parsed_args.smurf_lite_threshold
simev_frequency = parsed_args.simev_frequency
simev_count = parsed_args.simev_count
simev_threshold = parsed_args.simev_threshold

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# summary of the program configuration, logged for reproducibility
config = {'output_dir' : output_dir,
          'aligner' : aligner}
logger.debug('Program configuration: ' + str(config))

# end the program if the output dir doesn't exist; it is created by the
# generate-matt-alignments.py stage named in the message below
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py first')
    sys.exit(2)

logger.debug('Determining which hierarchy levels were left out during '
             'training...')
logger.debug(' output_dir contains: ' + str(os.listdir(output_dir)))
# a list comprehension instead of filter(): on Python 3 filter() returns a
# lazy, single-use iterator that would log as '<filter object ...>' below;
# on Python 2 the comprehension produces the same list filter() did
sunids = [entry for entry in os.listdir(output_dir)
          if os.path.isdir(os.path.join(output_dir, entry))]
logger.debug(' sunids: ' + str(sunids))
representative_field = parsed_args.repfield
# basename for the positive-control output produced by this script
output_filename = 'positive_controls'

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary, logged so a run's parameters can be reconstructed
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          'representative_field' : representative_field}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file
logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
# logger.debug('nrpdbs: ' + str(nrpdbs))

# get all the records from the SCOP classification file
# this now has (pdbid,chain) tuples
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
# parse command-line arguments for this alignment-results script
argparser = AlignmentArgumentParser(PROGRAM_DESCRIPTION)
parsed_args = argparser.parse_args()

# the directory containing the results from the smurf/hmmer alignment tests
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))

# check if the output dir exists
if not os.path.isdir(output_dir):
    logger.critical('Directory ' + output_dir + ' does not exist.')
    sys.exit(STATUS_NO_DIR)

# find all subdirectories of the output directory
# NOTE(review): under Python 3, filter() returns a lazy single-use
# iterator, so the debug line below would log '<filter object ...>'
# rather than the names -- confirm which Python version this targets
logger.debug('Determining which family directories exist...')
logger.debug(' output_dir contains: ' + str(os.listdir(output_dir)))
subdirectories = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                        os.listdir(output_dir))
logger.debug(' subdirectories: ' + str(subdirectories))

# get all subdirectories which are only digits
# TODO this is not the best way to do this
# NOTE(review): all(filter(pred, x)) is True whenever every *kept* element
# is truthy, so non-digit characters are silently dropped rather than
# failing the test; the intent is probably all(y.isdigit() for y in x)
families = filter(lambda x: all(filter(lambda y: y.isdigit(), x)),