def extract_tar(filename, directory, suffixes=('.pdb',)):
    """Extract files matching the given suffixes from a tar archive.

    Parameters
    ----------
    filename : str
        path to the tar archive (any compression tarfile understands - 'r:*')
    directory : str
        directory to extract the files into
    suffixes : sequence of str
        only members whose file extension is in this sequence are extracted
        (default: ('.pdb',) - a tuple, to avoid the mutable-default pitfall)

    Returns
    -------
    list of str
        paths of the extracted files

    Notes
    -----
    Exits via exit_util.exit_error if the archive is empty or contains no
    matching members.
    """
    logger.info('Extracting files from tarfile: %s', filename)
    files = []
    with tarfile.open(filename, 'r:*') as tf:
        memb = tf.getmembers()
        if not memb:
            msg = 'Empty archive: {0}'.format(filename)
            exit_util.exit_error(msg)
        for m in memb:
            if os.path.splitext(m.name)[1] in suffixes:
                # Hack to remove any paths - flattens the archive layout and
                # also prevents members escaping the target directory
                m.name = os.path.basename(m.name)
                tf.extract(m, path=directory)
                files.append(os.path.join(directory, m.name))
    if not files:
        msg = 'Could not find any files with suffixes {0} in archive: {1}'.format(
            suffixes, filename)
        exit_util.exit_error(msg)
    return files
def extract_zip(filename, directory, suffixes=None):
    """Extract files matching the given suffixes from a zip archive.

    Parameters
    ----------
    filename : str
        path to the zip archive
    directory : str
        directory to extract the files into
    suffixes : sequence of str or None
        only members whose file extension is in this sequence are extracted;
        None defaults to ('.pdb',)

    Returns
    -------
    list of str
        paths of the extracted files

    Notes
    -----
    Exits via exit_util.exit_error if the file is not a zip archive, is
    empty, or contains no matching members.
    """
    logger.info('Extracting files from zipfile: %s', filename)
    if suffixes is None:
        # BUGFIX: previously None was passed straight into the membership
        # test below, raising TypeError whenever the caller omitted suffixes.
        suffixes = ('.pdb',)
    if not zipfile.is_zipfile(filename):
        msg = 'File is not a valid zip archive: {0}'.format(filename)
        exit_util.exit_error(msg)
    files = []
    # Use a context manager so the archive handle is always closed
    with zipfile.ZipFile(filename) as zipf:
        zif = zipf.infolist()
        if not zif:
            msg = 'Empty zip file: {0}'.format(filename)
            exit_util.exit_error(msg)
        for f in zif:
            if os.path.splitext(f.filename)[1] in suffixes:
                # Hack to rewrite name - flattens any internal paths
                f.filename = os.path.basename(f.filename)
                zipf.extract(f, path=directory)
                files.append(os.path.join(directory, f.filename))
    if not files:
        msg = 'Could not find any files with suffixes {0} in zipfile: {1}'.format(suffixes, filename)
        exit_util.exit_error(msg)
    return files
def extract_zip(filename, directory, suffixes=('.pdb',)):
    """Extract files matching the given suffixes from a zip archive.

    Parameters
    ----------
    filename : str
        path to the zip archive
    directory : str
        directory to extract the files into
    suffixes : sequence of str
        only members whose file extension is in this sequence are extracted
        (default: ('.pdb',) - a tuple, to avoid the mutable-default pitfall)

    Returns
    -------
    list of str
        paths of the extracted files

    Notes
    -----
    Exits via exit_util.exit_error if the file is not a zip archive, is
    empty, or contains no matching members.
    """
    logger.info('Extracting files from zipfile: %s', filename)
    if not zipfile.is_zipfile(filename):
        msg = 'File is not a valid zip archive: {0}'.format(filename)
        exit_util.exit_error(msg)
    files = []
    # Use a context manager so the archive handle is always closed
    with zipfile.ZipFile(filename) as zipf:
        zif = zipf.infolist()
        if not zif:
            msg = 'Empty zip file: {0}'.format(filename)
            exit_util.exit_error(msg)
        for f in zif:
            if os.path.splitext(f.filename)[1] in suffixes:
                # Hack to rewrite name - flattens any internal paths
                f.filename = os.path.basename(f.filename)
                zipf.extract(f, path=directory)
                files.append(os.path.join(directory, f.filename))
    if not files:
        msg = 'Could not find any files with suffixes {0} in zipfile: {1}'.format(
            suffixes, filename)
        exit_util.exit_error(msg)
    return files
def process_fasta(amoptd):
    """Validate the input FASTA file and write a canonical copy.

    Checks that the file exists, parses it, ensures it holds exactly one
    sequence of workable length, rejects his-tagged sequences unless
    allow_his_tag is set, then writes a canonical fasta into the work
    directory and updates amoptd['fasta'], ['fasta_length'] and ['sequence'].

    Parameters
    ----------
    amoptd : dict
        AMPLE options dictionary; reads 'fasta', 'percent', 'allow_his_tag',
        'work_dir', 'name'

    Notes
    -----
    Exits via exit_util.exit_error on any validation failure.
    """
    # Check we can find the input fasta
    if not os.path.exists(str(amoptd['fasta'])):
        msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
        exit_util.exit_error(msg)
    # Reformat to what we need
    logging.debug('Parsing FASTA file')
    try:
        fp = Sequence(fasta=amoptd['fasta'])
    except Exception as e:
        # BUGFIX: e.message does not exist in Python 3 - format the
        # exception itself instead
        msg = "Error parsing FASTA file: {0}\n\n{1}".format(amoptd['fasta'], e)
        exit_util.exit_error(msg)
    if fp.numSequences() != 1:
        msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(amoptd['fasta'])
        exit_util.exit_error(msg)
    # Length checks
    amoptd['fasta_length'] = fp.length()
    logging.info("Fasta is {0} amino acids long".format(amoptd['fasta_length']))
    # Check we have a decent length
    if amoptd['fasta_length'] < 9:
        msg = "ERROR! Fasta is of length {0}. This is much too short!".format(amoptd['fasta_length'])
        exit_util.exit_error(msg)
    # Check we will be able to truncate at this level
    if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
        msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(
            amoptd['fasta_length'], amoptd['percent'])
        exit_util.exit_error(msg)
    # Check that the sequence doesn't have a his-tag in it
    if not amoptd['allow_his_tag']:
        his_tag = 'HHHHHH'
        i = fp.sequence().find(his_tag)
        l = fp.length()
        # BUGFIX: guard against find() returning -1 - for sequences shorter
        # than 20 residues, l - 20 <= -1 <= l was true and a missing tag was
        # falsely reported
        if i > -1 and ((0 <= i <= 20) or (l - 20 <= i <= l)):
            msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(
                his_tag, i)
            exit_util.exit_error(msg)
    # Fasta is ok, so write out a canonical fasta in the work directory
    outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
    fp.write_fasta(outfasta)
    amoptd['fasta'] = outfasta
    amoptd['sequence'] = fp.sequence()
    return
def processReflectionFile(amoptd):
    """Make sure we have a valid mtz file.

    If necessary convert a given cif file.
    Set the mtz variable in the given amoptd to the reflection file to use
    Return True if it all worked or raise an exception if it failed
    """
    # We've been given a sf_cif so convert to mtz
    if amoptd['sf_cif']:
        if not os.path.isfile(amoptd['sf_cif']):
            msg = "Cannot find sf_cif file: {0}".format(amoptd['sf_cif'])
            exit_util.exit_error(msg)
        if not os.path.splitext(amoptd['sf_cif'])[1].lower() == ".cif":
            msg = "Cif file extension is not .cif Please rename the file to give it a .cif extension."
            exit_util.exit_error(msg)
        cp = cif_parser.CifParser()
        amoptd['mtz'] = cp.sfcif2mtz(amoptd['sf_cif'])
    # Now have an mtz so check it's valid
    if not amoptd['mtz'] or not os.path.isfile(amoptd['mtz']):
        logger.critical("Cannot find MTZ file: %s", amoptd['mtz'])
        sys.exit(1)
    # Get column label info
    reflection_file = reflection_file_reader.any_reflection_file(
        file_name=amoptd['mtz'])
    if not reflection_file.file_type() == "ccp4_mtz":
        logger.critical("File is not of type ccp4_mtz: %s", amoptd['mtz'])
        sys.exit(1)
    # Read the file
    content = reflection_file.file_content()
    # Check any user-given flags
    for flag in ['F', 'SIGF', 'FREE']:
        if amoptd[flag] and amoptd[flag] not in content.column_labels():
            logger.critical("Cannot find flag %s label %s in mtz file %s",
                            flag, amoptd[flag], amoptd['mtz'])
            sys.exit(1)
    # If any of the flags aren't given we set defaults based on what's in the file
    if not amoptd['F']:
        if 'F' not in content.column_types():
            logger.critical(
                "Cannot find column type F for flag F in mtz file: %s",
                amoptd['mtz'])
            sys.exit(1)
        # Use the label of the first column whose type is F
        amoptd['F'] = content.column_labels()[content.column_types().index('F')]
    if not amoptd['SIGF']:
        # Conventionally the sigma column is the F label prefixed with SIG
        sigf_label = 'SIG' + amoptd['F']
        if sigf_label not in content.column_labels():
            logger.critical(
                "Cannot find column type %s for flag SIGF in mtz file: %s",
                sigf_label, amoptd['mtz'])
            sys.exit(1)
        amoptd['SIGF'] = sigf_label
    rfree = _get_rfree(content)
    if amoptd['FREE']:
        # Check is valid
        if not rfree or not rfree == amoptd['FREE']:
            logger.critical(
                "Given RFREE label %s is not valid for mtz file: %s",
                amoptd['FREE'], amoptd['mtz'])
            sys.exit(1)
    else:
        # See if we can find a valid label in the file
        if not rfree:
            # Need to generate RFREE
            logger.warning(
                "Cannot find a valid FREE flag - running uniquefy to generate column with RFREE data."
            )
            amoptd['mtz'] = add_rfree(amoptd['mtz'],
                                      directory=amoptd['work_dir'],
                                      overwrite=False)
            # Check file and get new FREE flag
            rfree = get_rfree(amoptd['mtz'])
            if not rfree:
                # BUGFIX: corrected "uniquiefy" typo in the log message
                logger.critical(
                    "Cannot find valid rfree flag in mtz file %s after running uniquefy",
                    amoptd['mtz'])
                sys.exit(1)
        amoptd['FREE'] = rfree
    # Output information to user and save to amoptd
    logger.info("Using MTZ file: %s", amoptd['mtz'])
    maxr, minr = content.max_min_resolution()
    amoptd['mtz_max_resolution'] = maxr
    amoptd['mtz_min_resolution'] = minr
    msg = "Resolution limits of MTZ file are: {0: > 6.3F} and {1: > 6.3F}".format(
        maxr, minr)
    logger.info(msg)
    return True
def process_fasta(amoptd, canonicalise=False):
    """Validate the input FASTA file and write a canonical copy.

    Checks that the file exists, parses it (optionally canonicalising),
    ensures it holds exactly one sequence of workable length, rejects
    his-tagged sequences unless allow_his_tag is set, then writes a
    canonical fasta into the work directory and updates amoptd['fasta'],
    ['fasta_length'] and ['sequence'].

    Parameters
    ----------
    amoptd : dict
        AMPLE options dictionary; reads 'fasta', 'percent', 'allow_his_tag',
        'work_dir', 'name'
    canonicalise : bool
        passed through to the Sequence constructor

    Notes
    -----
    Exits via exit_util.exit_error on any validation failure.
    """
    # Check we can find the input fasta
    if not os.path.exists(str(amoptd['fasta'])):
        msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
        exit_util.exit_error(msg)
    # Reformat to what we need
    logging.debug('Parsing FASTA file')
    try:
        fp = Sequence(fasta=amoptd['fasta'], canonicalise=canonicalise)
    except Exception as e:
        # BUGFIX: e.message does not exist in Python 3 - format the
        # exception itself instead
        msg = "Error parsing FASTA file: {0}\n\n{1}".format(amoptd['fasta'], e)
        exit_util.exit_error(msg)
    if fp.numSequences() != 1:
        msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(amoptd['fasta'])
        exit_util.exit_error(msg)
    # Length checks
    amoptd['fasta_length'] = fp.length()
    logging.info("Fasta is {0} amino acids long".format(amoptd['fasta_length']))
    # Check we have a decent length
    if amoptd['fasta_length'] < 9:
        msg = "ERROR! Fasta is of length {0}. This is much too short!".format(amoptd['fasta_length'])
        exit_util.exit_error(msg)
    # Check we will be able to truncate at this level
    if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
        msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(amoptd['fasta_length'], amoptd['percent'])
        exit_util.exit_error(msg)
    # Check that the sequence doesn't have a his-tag in it
    if not amoptd['allow_his_tag']:
        his_tag = 'HHHHHH'
        i = fp.sequence().find(his_tag)
        l = fp.length()
        # BUGFIX: guard against find() returning -1 - for sequences shorter
        # than 20 residues, l - 20 <= -1 <= l was true and a missing tag was
        # falsely reported
        if i > -1 and ((0 <= i <= 20) or (l - 20 <= i <= l)):
            msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(his_tag, i)
            exit_util.exit_error(msg)
    # Fasta is ok, so write out a canonical fasta in the work directory
    outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
    fp.write_fasta(outfasta)
    amoptd['fasta'] = outfasta
    amoptd['sequence'] = fp.sequence()
    return
def extract_models(amoptd, sequence=None, single=True, allsame=True):
    """Extract some models

    Description
    -----------
    Check a directory of pdbs or extract pdb files from a given tar/zip file
    or directory of pdbs and set the amoptd['models_dir'] entry with the
    directory of unpacked/validated pdbs

    Parameters
    ----------
    amoptd : dict
        AMPLE options dictionary; reads 'models' and 'models_dir', sets
        'models_dir' and possibly 'quark_models'
    sequence : str
        passed to pdb_edit.check_pdb_directory for validation
    single : bool
        passed to pdb_edit.check_pdb_directory for validation
    allsame : bool
        passed to pdb_edit.check_pdb_directory for validation

    Returns
    -------
    list of str
        paths of the pdb files in the models directory
    """
    filename = amoptd['models']
    models_dir = amoptd['models_dir']
    if os.path.isdir(filename):
        # If it's already a models_dir, just check it's valid
        models_dir = filename
    else:
        # Here we are extracting from a file
        if not os.path.isfile(filename):
            msg = "Cannot find models file: {0}".format(filename)
            exit_util.exit_error(msg)
        # we need a models_dir to extract into
        assert models_dir, "extract_models() needs a models_dir path!"
        if not os.path.isdir(models_dir):
            os.mkdir(models_dir)
        # (removed no-op assignment "models_dir = models_dir")
        # See what sort of file this is - unwrap a compression suffix to
        # find a possible .tar underneath
        f, suffix = os.path.splitext(filename)
        if suffix in ['.gz', '.bz']:
            f, s2 = os.path.splitext(f)
            if s2 == '.tar':
                suffix = s2 + suffix
        tar_suffixes = ['.tar.gz', '.tgz', '.tar.bz', '.tbz']
        suffixes = tar_suffixes + ['.zip']
        if suffix not in suffixes:
            msg = "Do not know how to extract files from file: {0}\n " \
                  "Acceptable file types are: {1}".format(filename, suffixes)
            exit_util.exit_error(msg)
        if suffix in tar_suffixes:
            files = extract_tar(filename, models_dir)
        else:
            files = extract_zip(filename, models_dir)
        # Assume anything with one member is quark decoys
        if len(files) == 1:
            quark_filename = 'alldecoy.pdb'
            f = os.path.basename(files[0])
            if f != quark_filename:
                msg = "Only found one member ({0}) in file: {1} " \
                      "and the name was not {2}\n".format(f, filename, quark_filename)
                msg += "If this file contains valid QUARK decoys, please email: [email protected]"
                exit_util.exit_error(msg)
            # Now extract the quark pdb files from the monolithic file
            split_quark(files[0], models_dir)
            # We delete the quark_name file as otherwise we'll try and model it
            os.unlink(files[0])
            # If we've got quark models we don't want to modify the side
            # chains as we only have polyalanine so we set this here -
            # horribly untidy as we should have one place to decide on
            # side chains
            logger.info('Found QUARK models in file: %s', filename)
            amoptd['quark_models'] = True
    if not pdb_edit.check_pdb_directory(
            models_dir, sequence=sequence, single=single, allsame=allsame):
        msg = "Problem importing pdb files - please check the log for more information"
        exit_util.exit_error(msg)
    amoptd['models_dir'] = models_dir
    return glob.glob(os.path.join(models_dir, "*.pdb"))
def extract_and_validate_models(amoptd, sequence=None, single=True, allsame=True):
    """Extract models given to AMPLE from arguments in the amoptd and validate
    that they are suitable

    Parameters
    ----------
    amoptd : dict
        AMPLE options dictionary
    sequence : str
        single-letter protein sequence - if given a check will be made that
        all models are of this sequence
    single : bool
        if True check each pdb only contains a single model
    allsame : bool
        if True check that all pdbs contain the same sequence (passed through
        to pdb_edit.check_pdb_directory) - previous docstring wrongly
        described a suffix filter here
    """
    filepath = amoptd['models']
    models_dir = amoptd['models_dir']
    filenames = None
    quark_models = False
    if os.path.isfile(filepath):
        basename = os.path.basename(filepath)
        # These archive names are the conventional QUARK result bundles
        if basename in ['result.tar.bz2', 'decoys.tar.gz']:
            logger.info('Assuming QUARK models in file: %s', filepath)
            quark_models = True
            filenames = ['alldecoy.pdb']
        try:
            extract_models_from_archive(filepath,
                                        models_dir,
                                        suffixes=['.pdb', '.PDB'],
                                        filenames=filenames)
        except Exception as e:
            exit_util.exit_error(
                "Error extracting models from file: {0}\n{1}".format(
                    filepath, e))
    elif os.path.isdir(filepath):
        models_dir = filepath
    if quark_models:
        try:
            split_quark(models_dir)
        except Exception as e:
            exit_util.exit_error(
                "Error splitting QUARK models from file: {0}\n{1}".format(
                    filepath, e))
        amoptd['quark_models'] = True
    if not pdb_edit.check_pdb_directory(
            models_dir, sequence=sequence, single=single, allsame=allsame):
        msg = "Problem importing pdb files - please check the log for more information"
        exit_util.exit_error(msg)
    amoptd['models_dir'] = models_dir
    return glob.glob(os.path.join(models_dir, "*.pdb"))
def processReflectionFile(amoptd):
    """Make sure we have a valid mtz file.

    If necessary convert a given cif file.
    Set the mtz variable in the given amoptd to the reflection file to use
    Return True if it all worked or raise an exception if it failed
    """
    # We've been given a sf_cif so convert to mtz
    if amoptd['sf_cif']:
        if not os.path.isfile(amoptd['sf_cif']):
            msg = "Cannot find sf_cif file: {0}".format(amoptd['sf_cif'])
            exit_util.exit_error(msg)
        if not os.path.splitext(amoptd['sf_cif'])[1].lower() == ".cif":
            msg = "Cif file extension is not .cif Please rename the file to give it a .cif extension."
            exit_util.exit_error(msg)
        cp = cif_parser.CifParser()
        amoptd['mtz'] = cp.sfcif2mtz(amoptd['sf_cif'])
    # Now have an mtz so check it's valid
    if not amoptd['mtz'] or not os.path.isfile(amoptd['mtz']):
        logger.critical("Cannot find MTZ file: %s", amoptd['mtz'])
        sys.exit(1)
    # Get column label info
    reflection_file = reflection_file_reader.any_reflection_file(
        file_name=amoptd['mtz'])
    if not reflection_file.file_type() == "ccp4_mtz":
        logger.critical("File is not of type ccp4_mtz: %s", amoptd['mtz'])
        sys.exit(1)
    # Read the file
    content = reflection_file.file_content()
    # Check any user-given flags
    for flag in ['F', 'SIGF', 'FREE']:
        if amoptd[flag] and amoptd[flag] not in content.column_labels():
            logger.critical("Cannot find flag %s label %s in mtz file %s",
                            flag, amoptd[flag], amoptd['mtz'])
            sys.exit(1)
    # If any of the flags aren't given we set defaults based on what's in the file
    if not amoptd['F']:
        if 'F' not in content.column_types():
            logger.critical(
                "Cannot find column type F for flag F in mtz file: %s",
                amoptd['mtz'])
            sys.exit(1)
        # Use the label of the first column whose type is F
        amoptd['F'] = content.column_labels()[content.column_types().index('F')]
    if not amoptd['SIGF']:
        # Conventionally the sigma column is the F label prefixed with SIG
        sigf_label = 'SIG' + amoptd['F']
        if sigf_label not in content.column_labels():
            logger.critical(
                "Cannot find column type %s for flag SIGF in mtz file: %s",
                sigf_label, amoptd['mtz'])
            sys.exit(1)
        amoptd['SIGF'] = sigf_label
    rfree = _get_rfree(content)
    if amoptd['FREE']:
        # Check is valid
        if not rfree or not rfree == amoptd['FREE']:
            logger.critical("Given RFREE label %s is not valid for mtz file: %s",
                            amoptd['FREE'], amoptd['mtz'])
            sys.exit(1)
    else:
        # See if we can find a valid label in the file
        if not rfree:
            # Need to generate RFREE
            logger.warning(
                "Cannot find a valid FREE flag - running uniquefy to generate column with RFREE data.")
            amoptd['mtz'] = add_rfree(amoptd['mtz'],
                                      directory=amoptd['work_dir'],
                                      overwrite=False)
            # Check file and get new FREE flag
            rfree = get_rfree(amoptd['mtz'])
            if not rfree:
                # BUGFIX: corrected "uniquiefy" typo in the log message
                logger.critical(
                    "Cannot find valid rfree flag in mtz file %s after running uniquefy",
                    amoptd['mtz'])
                sys.exit(1)
        amoptd['FREE'] = rfree
    # Output information to user and save to amoptd
    logger.info("Using MTZ file: %s", amoptd['mtz'])
    maxr, minr = content.max_min_resolution()
    amoptd['mtz_min_resolution'] = minr
    amoptd['mtz_max_resolution'] = maxr
    msg = "Resolution limits of MTZ file are: {0: > 6.3F} and {1: > 6.3F}".format(minr, maxr)
    logger.info(msg)
    return True