예제 #1
0
def extract_tar(filename, directory, suffixes=['.pdb']):
    """Extract the members of a tar archive whose extension is in ``suffixes``.

    Any directory component of a member name is dropped, so every selected
    member is written directly into ``directory``.

    Parameters
    ----------
    filename : str
       path to the tar archive (opened with transparent compression, 'r:*')
    directory : str
       directory the selected members are extracted into
    suffixes : list
       file extensions, including the leading dot, that should be extracted

    Returns
    -------
    list
       paths (``directory`` joined with the flattened member name) of the
       extracted members, in archive order

    Notes
    -----
    An archive without members, or without any member matching ``suffixes``,
    is reported through ``exit_util.exit_error``.
    """
    logger.info('Extracting files from tarfile: %s', filename)
    extracted = []
    with tarfile.open(filename, 'r:*') as archive:
        members = archive.getmembers()
        if len(members) == 0:
            exit_util.exit_error('Empty archive: {0}'.format(filename))
        for member in members:
            extension = os.path.splitext(member.name)[1]
            if extension not in suffixes:
                continue
            # Flatten the archive layout: keep only the final path component
            member.name = os.path.basename(member.name)
            archive.extract(member, path=directory)
            extracted.append(os.path.join(directory, member.name))
    if len(extracted) == 0:
        template = 'Could not find any files with suffixes {0} in archive: {1}'
        exit_util.exit_error(template.format(suffixes, filename))
    return extracted
예제 #2
0
def extract_zip(filename, directory, suffixes=None):
    """Extract files from a zip archive, optionally filtered by extension.

    Any directory component of an entry name is dropped, so every selected
    entry is written directly into ``directory``.

    Parameters
    ----------
    filename : str
       path to the zip archive
    directory : str
       directory the selected entries are extracted into
    suffixes : list, optional
       file extensions, including the leading dot, that should be extracted.
       If None (the default) no extension filter is applied and every file
       entry is extracted.

    Returns
    -------
    list
       paths (``directory`` joined with the flattened entry name) of the
       extracted entries, in archive order

    Notes
    -----
    A file that is not a zip archive, an archive without entries, or an
    archive from which nothing was extracted is reported through
    ``exit_util.exit_error``.
    """
    logger.info('Extracting files from zipfile: %s', filename)
    if not zipfile.is_zipfile(filename):
        msg = 'File is not a valid zip archive: {0}'.format(filename)
        exit_util.exit_error(msg)
    files = []
    # The context manager guarantees the archive handle is closed, even when
    # extraction raises or the error handler terminates the program.
    with zipfile.ZipFile(filename) as zipf:
        zif = zipf.infolist()
        if not zif:
            msg = 'Empty zip file: {0}'.format(filename)
            exit_util.exit_error(msg)
        for f in zif:
            # Directory entries end with '/'; their basename is empty so they
            # cannot be flattened into ``directory``.
            if f.filename.endswith('/'):
                continue
            # suffixes=None means "no filter"; testing membership in None
            # would raise a TypeError.
            if suffixes is not None and os.path.splitext(f.filename)[1] not in suffixes:
                continue
            # Hack to rewrite name: strip any path so the file lands directly
            # in ``directory``
            f.filename = os.path.basename(f.filename)
            zipf.extract(f, path=directory)
            files.append(os.path.join(directory, f.filename))
    if not files:
        msg = 'Could not find any files with suffixes {0} in zipfile: {1}'.format(suffixes, filename)
        exit_util.exit_error(msg)
    return files
예제 #3
0
def extract_zip(filename, directory, suffixes=['.pdb']):
    """Extract the entries of a zip archive whose extension is in ``suffixes``.

    Any directory component of an entry name is dropped, so every selected
    entry is written directly into ``directory``.

    Parameters
    ----------
    filename : str
       path to the zip archive
    directory : str
       directory the selected entries are extracted into
    suffixes : list
       file extensions, including the leading dot, that should be extracted.
       The default list is only read, never modified.

    Returns
    -------
    list
       paths (``directory`` joined with the flattened entry name) of the
       extracted entries, in archive order

    Notes
    -----
    A file that is not a zip archive, an archive without entries, or an
    archive without any entry matching ``suffixes`` is reported through
    ``exit_util.exit_error``.
    """
    logger.info('Extracting files from zipfile: %s', filename)
    if not zipfile.is_zipfile(filename):
        msg = 'File is not a valid zip archive: {0}'.format(filename)
        exit_util.exit_error(msg)
    files = []
    # The context manager guarantees the archive handle is closed, even when
    # extraction raises or the error handler terminates the program.
    with zipfile.ZipFile(filename) as zipf:
        zif = zipf.infolist()
        if not zif:
            msg = 'Empty zip file: {0}'.format(filename)
            exit_util.exit_error(msg)
        for f in zif:
            if os.path.splitext(f.filename)[1] in suffixes:
                # Hack to rewrite name: strip any path so the file lands
                # directly in ``directory``
                f.filename = os.path.basename(f.filename)
                zipf.extract(f, path=directory)
                files.append(os.path.join(directory, f.filename))
    if not files:
        msg = 'Could not find any files with suffixes {0} in zipfile: {1}'.format(
            suffixes, filename)
        exit_util.exit_error(msg)
    return files
예제 #4
0
def process_fasta(amoptd):
    """Validate the input FASTA file and write a copy into the work directory.

    Parameters
    ----------
    amoptd : dict
       AMPLE options dictionary. The keys 'fasta', 'percent',
       'allow_his_tag', 'work_dir' and 'name' are read. On success
       'fasta_length' and 'sequence' are set and 'fasta' is replaced by the
       path of the file written to the work directory.

    Notes
    -----
    Every failed check is reported through ``exit_util.exit_error``: missing
    file, parse error, a sequence count other than one, a sequence shorter
    than 9 residues, a truncation interval that covers less than one residue,
    or (unless 'allow_his_tag' is set) a his-tag near either terminus.
    """
    # Check we can find the input fasta
    if not os.path.exists(str(amoptd['fasta'])):
        msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
        exit_util.exit_error(msg)

    # Reformat to what we need
    logging.debug('Parsing FASTA file')
    try:
        fp = Sequence(fasta=amoptd['fasta'])
    except Exception as e:
        # Format the exception itself: the ``message`` attribute does not
        # exist on Python 3 exceptions.
        msg = "Error parsing FASTA file: {0}\n\n{1}".format(
            amoptd['fasta'], e)
        exit_util.exit_error(msg)
    if fp.numSequences() != 1:
        msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(
            amoptd['fasta'])
        exit_util.exit_error(msg)

    # Length checks
    amoptd['fasta_length'] = fp.length()
    logging.info("Fasta is %s amino acids long", amoptd['fasta_length'])

    # Check we have a decent length
    if amoptd['fasta_length'] < 9:
        msg = "ERROR! Fasta is of length {0}. This is much too short!".format(
            amoptd['fasta_length'])
        exit_util.exit_error(msg)

    # Check we will be able to truncate at this level
    if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
        msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(
            amoptd['fasta_length'], amoptd['percent'])
        exit_util.exit_error(msg)

    # Check that the sequence doesn't have a his-tag in it
    if not amoptd['allow_his_tag']:
        his_tag = 'HHHHHH'
        sequence = fp.sequence()
        length = fp.length()
        first = sequence.find(his_tag)
        last = sequence.rfind(his_tag)
        # find()/rfind() return -1 when the tag is absent; both tests require
        # a real (>= 0) index so that short sequences without a tag are not
        # flagged. The last occurrence is used for the C-terminal test so a
        # terminal tag is not hidden by an earlier internal one.
        position = None
        if 0 <= first <= 20:
            position = first
        elif last >= 0 and length - 20 <= last:
            position = last
        if position is not None:
            msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(
                his_tag, position)
            exit_util.exit_error(msg)

    # Fasta is ok, so write out a canonical fasta in the work directory
    outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
    fp.write_fasta(outfasta)
    amoptd['fasta'] = outfasta
    amoptd['sequence'] = fp.sequence()
예제 #5
0
파일: mtz_util.py 프로젝트: hlasimpk/ample
def processReflectionFile(amoptd):
    """Validate the reflection data and settle the F, SIGF and FREE labels.

    If amoptd['sf_cif'] is set, the cif file is converted with
    cif_parser.CifParser.sfcif2mtz and the result stored in amoptd['mtz'].
    The mtz file is then checked, user-supplied column labels are verified,
    missing labels are filled in from the file content, and a FREE column is
    generated with add_rfree when the file has none.

    On success amoptd['mtz'], 'F', 'SIGF', 'FREE', 'mtz_max_resolution' and
    'mtz_min_resolution' are set and True is returned. Failures are reported
    through exit_util.exit_error, or through logger.critical followed by
    sys.exit(1).
    """
    # Optional conversion step: structure-factor cif -> mtz
    sf_cif = amoptd['sf_cif']
    if sf_cif:
        if not os.path.isfile(sf_cif):
            exit_util.exit_error("Cannot find sf_cif file: {0}".format(sf_cif))
        extension = os.path.splitext(sf_cif)[1]
        if extension.lower() != ".cif":
            exit_util.exit_error(
                "Cif file extension is not .cif Please rename the file to give it a .cif extension."
            )
        amoptd['mtz'] = cif_parser.CifParser().sfcif2mtz(sf_cif)

    # From here on an mtz file must exist
    mtz = amoptd['mtz']
    if not (mtz and os.path.isfile(mtz)):
        logger.critical("Cannot find MTZ file: %s", mtz)
        sys.exit(1)

    reflection_file = reflection_file_reader.any_reflection_file(file_name=mtz)
    if reflection_file.file_type() != "ccp4_mtz":
        logger.critical("File is not of type ccp4_mtz: %s", mtz)
        sys.exit(1)

    content = reflection_file.file_content()

    # Labels supplied by the user have to be present in the file
    for flag in ('F', 'SIGF', 'FREE'):
        label = amoptd[flag]
        if label and label not in content.column_labels():
            logger.critical("Cannot find flag %s label %s in mtz file %s",
                            flag, label, mtz)
            sys.exit(1)

    # Default F: the label of the first column whose type is 'F'
    if not amoptd['F']:
        if 'F' not in content.column_types():
            logger.critical(
                "Cannot find column type F for flag F in mtz file: %s", mtz)
            sys.exit(1)
        f_index = content.column_types().index('F')
        amoptd['F'] = content.column_labels()[f_index]

    # Default SIGF: the F label prefixed with 'SIG'
    if not amoptd['SIGF']:
        sigf_label = 'SIG' + amoptd['F']
        if sigf_label not in content.column_labels():
            logger.critical(
                "Cannot find column type %s for flag SIGF in mtz file: %s",
                sigf_label, mtz)
            sys.exit(1)
        amoptd['SIGF'] = sigf_label

    rfree = _get_rfree(content)
    if amoptd['FREE']:
        # A user-supplied label must be the one detected in the file
        if not (rfree and rfree == amoptd['FREE']):
            logger.critical(
                "Given RFREE label %s is not valid for mtz file: %s",
                amoptd['FREE'], amoptd['mtz'])
            sys.exit(1)
    else:
        if not rfree:
            # No usable column in the file: generate one in a new mtz file
            logger.warning(
                "Cannot find a valid FREE flag - running uniquefy to generate column with RFREE data."
            )
            amoptd['mtz'] = add_rfree(amoptd['mtz'],
                                      directory=amoptd['work_dir'],
                                      overwrite=False)

            # Re-read the label from the newly written file
            rfree = get_rfree(amoptd['mtz'])
            if not rfree:
                logger.critical(
                    "Cannot find valid rfree flag in mtz file %s after running uniquiefy",
                    amoptd['mtz'])
                sys.exit(1)
        amoptd['FREE'] = rfree

    # Report to the user and record the resolution limits. These come from the
    # content object read before any FREE column was added.
    logger.info("Using MTZ file: %s", amoptd['mtz'])
    resolution_max, resolution_min = content.max_min_resolution()
    amoptd['mtz_max_resolution'] = resolution_max
    amoptd['mtz_min_resolution'] = resolution_min
    logger.info(
        "Resolution limits of MTZ file are: {0: > 6.3F} and {1: > 6.3F}".format(
            resolution_max, resolution_min))

    return True
예제 #6
0
def process_fasta(amoptd, canonicalise=False):
    """Validate the input FASTA file and write a copy into the work directory.

    Parameters
    ----------
    amoptd : dict
       AMPLE options dictionary. The keys 'fasta', 'percent',
       'allow_his_tag', 'work_dir' and 'name' are read. On success
       'fasta_length' and 'sequence' are set and 'fasta' is replaced by the
       path of the file written to the work directory.
    canonicalise : bool
       passed unchanged to the ``Sequence`` constructor

    Notes
    -----
    Every failed check is reported through ``exit_util.exit_error``: missing
    file, parse error, a sequence count other than one, a sequence shorter
    than 9 residues, a truncation interval that covers less than one residue,
    or (unless 'allow_his_tag' is set) a his-tag near either terminus.
    """
    # Check we can find the input fasta
    if not os.path.exists(str(amoptd['fasta'])):
        msg = 'Cannot find fasta file: {0}'.format(amoptd['fasta'])
        exit_util.exit_error(msg)

    # Reformat to what we need
    logging.debug('Parsing FASTA file')
    try:
        fp = Sequence(fasta=amoptd['fasta'], canonicalise=canonicalise)
    except Exception as e:
        # Format the exception itself: the ``message`` attribute does not
        # exist on Python 3 exceptions.
        msg = "Error parsing FASTA file: {0}\n\n{1}".format(amoptd['fasta'], e)
        exit_util.exit_error(msg)
    if fp.numSequences() != 1:
        msg = "ERROR! Fasta file {0} has > 1 sequence in it.".format(amoptd['fasta'])
        exit_util.exit_error(msg)

    # Length checks
    amoptd['fasta_length'] = fp.length()
    logging.info("Fasta is %s amino acids long", amoptd['fasta_length'])

    # Check we have a decent length
    if amoptd['fasta_length'] < 9:
        msg = "ERROR! Fasta is of length {0}. This is much too short!".format(amoptd['fasta_length'])
        exit_util.exit_error(msg)

    # Check we will be able to truncate at this level
    if (float(amoptd['fasta_length']) / 100) * float(amoptd['percent']) < 1:
        msg = "Cannot truncate a fasta sequence of length {0} with {1} percent intervals. Please select a larger interval.".format(amoptd['fasta_length'], amoptd['percent'])
        exit_util.exit_error(msg)

    # Check that the sequence doesn't have a his-tag in it
    if not amoptd['allow_his_tag']:
        his_tag = 'HHHHHH'
        sequence = fp.sequence()
        length = fp.length()
        first = sequence.find(his_tag)
        last = sequence.rfind(his_tag)
        # find()/rfind() return -1 when the tag is absent; both tests require
        # a real (>= 0) index so that short sequences without a tag are not
        # flagged. The last occurrence is used for the C-terminal test so a
        # terminal tag is not hidden by an earlier internal one.
        position = None
        if 0 <= first <= 20:
            position = first
        elif last >= 0 and length - 20 <= last:
            position = last
        if position is not None:
            msg = 'The fasta sequence contains a his tag sequence {0} at position {1}. If you wish to use ample with this sequence, please use the \"-allow_his_tag True\" option'.format(his_tag, position)
            exit_util.exit_error(msg)

    # Fasta is ok, so write out a canonical fasta in the work directory
    outfasta = os.path.join(amoptd['work_dir'], amoptd['name'] + '_.fasta')
    fp.write_fasta(outfasta)
    amoptd['fasta'] = outfasta
    amoptd['sequence'] = fp.sequence()
예제 #7
0
def extract_models(amoptd, sequence=None, single=True, allsame=True):
    """Extract some models

    Description
    -----------
    Check a directory of pdbs or extract pdb files from a given tar/zip file
    and set the amoptd['models_dir'] entry with the directory of
    unpacked/validated pdbs.

    Parameters
    ----------
    amoptd : dict
       AMPLE options dictionary. amoptd['models'] is the directory or archive
       to read; amoptd['models_dir'] is the directory archives are unpacked
       into. amoptd['models_dir'] is updated, and amoptd['quark_models'] is
       set to True when a single-member QUARK archive was unpacked.
    sequence, single, allsame
       passed unchanged to pdb_edit.check_pdb_directory

    Returns
    -------
    list
       the "*.pdb" files found in the models directory

    Raises
    ------
    AssertionError
       if an archive is given but amoptd['models_dir'] is not set

    Notes
    -----
    Other failures (missing file, unsupported archive type, unexpected
    single-member archive, failed directory check) are reported through
    exit_util.exit_error.
    """

    filename = amoptd['models']
    models_dir = amoptd['models_dir']

    # If it's already a models_dir, just check it's valid
    if os.path.isdir(filename):
        models_dir = filename
    else:
        # Here we are extracting from a file
        if not os.path.isfile(filename):
            msg = "Cannot find models file: {0}".format(filename)
            exit_util.exit_error(msg)

        # We need a models_dir to extract into. Raised explicitly (rather than
        # with an assert statement) so the check survives running under -O.
        if not models_dir:
            raise AssertionError("extract_models() needs a models_dir path!")
        if not os.path.isdir(models_dir):
            os.mkdir(models_dir)

        # See what sort of file this is; for compressed tarballs the suffix
        # is the double extension, e.g. '.tar.gz'
        stem, suffix = os.path.splitext(filename)
        if suffix in ['.gz', '.bz', '.bz2']:
            stem, inner_suffix = os.path.splitext(stem)
            if inner_suffix == '.tar':
                suffix = inner_suffix + suffix

        tar_suffixes = [
            '.tar.gz', '.tgz', '.tar.bz', '.tbz', '.tar.bz2', '.tbz2'
        ]
        suffixes = tar_suffixes + ['.zip']
        if suffix not in suffixes:
            msg = "Do not know how to extract files from file: {0}\n " \
                  "Acceptable file types are: {1}".format(filename, suffixes)
            exit_util.exit_error(msg)
        if suffix in tar_suffixes:
            files = extract_tar(filename, models_dir)
        else:
            files = extract_zip(filename, models_dir)

        # Assume anything with one member is quark decoys
        if len(files) == 1:
            quark_filename = 'alldecoy.pdb'
            member = os.path.basename(files[0])
            if member != quark_filename:
                msg = "Only found one member ({0}) in file: {1} " \
                      "and the name was not {2}\n".format(member, filename, quark_filename)
                msg += "If this file contains valid QUARK decoys, please email: [email protected]"
                exit_util.exit_error(msg)
            # Now extract the quark pdb files from the monolithic file
            split_quark(files[0], models_dir)
            # We delete the quark_name file as otherwise we'll try and model it
            os.unlink(files[0])
            # If we've got quark models we don't want to modify the side chains as we only have polyalanine so we
            # set this here - horribly untidy as we should have one place to decide on side chains
            logger.info('Found QUARK models in file: %s', filename)
            amoptd['quark_models'] = True

    if not pdb_edit.check_pdb_directory(
            models_dir, sequence=sequence, single=single, allsame=allsame):
        msg = "Problem importing pdb files - please check the log for more information"
        exit_util.exit_error(msg)

    amoptd['models_dir'] = models_dir
    return glob.glob(os.path.join(models_dir, "*.pdb"))
예제 #8
0
def extract_and_validate_models(amoptd,
                                sequence=None,
                                single=True,
                                allsame=True):
    """Unpack the models referenced by the AMPLE options and validate them.

    Parameters
    ----------
    amoptd : dict
       AMPLE options dictionary. amoptd['models'] is the archive or directory
       holding the models and amoptd['models_dir'] the directory archives are
       unpacked into. amoptd['models_dir'] is updated, and
       amoptd['quark_models'] is set to True for QUARK archives.
    sequence : str
       single-letter protein sequence - if given a check will be made that all
       models are of this sequence
    single : bool
       if True check each pdb only contains a single model
    allsame : bool
       passed unchanged to pdb_edit.check_pdb_directory (presumably requires
       all models to be of the same sequence - TODO confirm)

    Returns
    -------
    list
       the "*.pdb" files found in the models directory

    Notes
    -----
    Archives named 'result.tar.bz2' or 'decoys.tar.gz' are assumed to hold
    QUARK models; only 'alldecoy.pdb' is requested from them and the result
    is passed to split_quark. Errors are reported through
    exit_util.exit_error.

    NOTE(review): when amoptd['models'] is neither an existing file nor a
    directory, nothing is extracted and amoptd['models_dir'] is validated
    as it stands.
    """

    source_path = amoptd['models']
    models_dir = amoptd['models_dir']

    quark_models = False

    if os.path.isfile(source_path):
        # QUARK archives are recognised purely by their file name
        archive_name = os.path.basename(source_path)
        quark_models = archive_name in ('result.tar.bz2', 'decoys.tar.gz')
        wanted_members = None
        if quark_models:
            logger.info('Assuming QUARK models in file: %s', source_path)
            wanted_members = ['alldecoy.pdb']
        try:
            extract_models_from_archive(source_path,
                                        models_dir,
                                        suffixes=['.pdb', '.PDB'],
                                        filenames=wanted_members)
        except Exception as err:
            template = "Error extracting models from file: {0}\n{1}"
            exit_util.exit_error(template.format(source_path, err))
    elif os.path.isdir(source_path):
        # Already a directory of models - use it directly
        models_dir = source_path

    if quark_models:
        try:
            split_quark(models_dir)
        except Exception as err:
            template = "Error splitting QUARK models from file: {0}\n{1}"
            exit_util.exit_error(template.format(source_path, err))
        amoptd['quark_models'] = True

    directory_ok = pdb_edit.check_pdb_directory(
        models_dir, sequence=sequence, single=single, allsame=allsame)
    if not directory_ok:
        exit_util.exit_error(
            "Problem importing pdb files - please check the log for more information")

    amoptd['models_dir'] = models_dir
    pdb_pattern = os.path.join(models_dir, "*.pdb")
    return glob.glob(pdb_pattern)
예제 #9
0
def processReflectionFile(amoptd):
    """Validate the reflection data and settle the F, SIGF and FREE labels.

    If amoptd['sf_cif'] is set, the cif file is converted with
    cif_parser.CifParser.sfcif2mtz and the result stored in amoptd['mtz'].
    The mtz file is then checked, user-supplied column labels are verified,
    missing labels are filled in from the file content, and a FREE column is
    generated with add_rfree when the file has none.

    On success amoptd['mtz'], 'F', 'SIGF', 'FREE', 'mtz_min_resolution' and
    'mtz_max_resolution' are set and True is returned. Failures are reported
    through exit_util.exit_error, or through logger.critical followed by
    sys.exit(1).
    """
    # Optional conversion step: structure-factor cif -> mtz
    sf_cif = amoptd['sf_cif']
    if sf_cif:
        if not os.path.isfile(sf_cif):
            exit_util.exit_error("Cannot find sf_cif file: {0}".format(sf_cif))
        extension = os.path.splitext(sf_cif)[1]
        if extension.lower() != ".cif":
            exit_util.exit_error(
                "Cif file extension is not .cif Please rename the file to give it a .cif extension.")
        amoptd['mtz'] = cif_parser.CifParser().sfcif2mtz(sf_cif)

    # From here on an mtz file must exist
    mtz = amoptd['mtz']
    if not (mtz and os.path.isfile(mtz)):
        logger.critical("Cannot find MTZ file: %s", mtz)
        sys.exit(1)

    reflection_file = reflection_file_reader.any_reflection_file(file_name=mtz)
    if reflection_file.file_type() != "ccp4_mtz":
        logger.critical("File is not of type ccp4_mtz: %s", mtz)
        sys.exit(1)

    content = reflection_file.file_content()

    # Labels supplied by the user have to be present in the file
    for flag in ('F', 'SIGF', 'FREE'):
        label = amoptd[flag]
        if label and label not in content.column_labels():
            logger.critical("Cannot find flag %s label %s in mtz file %s", flag, label, mtz)
            sys.exit(1)

    # Default F: the label of the first column whose type is 'F'
    if not amoptd['F']:
        if 'F' not in content.column_types():
            logger.critical("Cannot find column type F for flag F in mtz file: %s", mtz)
            sys.exit(1)
        f_index = content.column_types().index('F')
        amoptd['F'] = content.column_labels()[f_index]

    # Default SIGF: the F label prefixed with 'SIG'
    if not amoptd['SIGF']:
        sigf_label = 'SIG' + amoptd['F']
        if sigf_label not in content.column_labels():
            logger.critical("Cannot find column type %s for flag SIGF in mtz file: %s", sigf_label, mtz)
            sys.exit(1)
        amoptd['SIGF'] = sigf_label

    rfree = _get_rfree(content)
    if amoptd['FREE']:
        # A user-supplied label must be the one detected in the file
        if not (rfree and rfree == amoptd['FREE']):
            logger.critical("Given RFREE label %s is not valid for mtz file: %s", amoptd['FREE'], amoptd['mtz'])
            sys.exit(1)
    else:
        if not rfree:
            # No usable column in the file: generate one in a new mtz file
            logger.warning("Cannot find a valid FREE flag - running uniquefy to generate column with RFREE data.")
            amoptd['mtz'] = add_rfree(amoptd['mtz'], directory=amoptd['work_dir'], overwrite=False)

            # Re-read the label from the newly written file
            rfree = get_rfree(amoptd['mtz'])
            if not rfree:
                logger.critical("Cannot find valid rfree flag in mtz file %s after running uniquiefy", amoptd['mtz'])
                sys.exit(1)
        amoptd['FREE'] = rfree

    # Report to the user and record the resolution limits. These come from the
    # content object read before any FREE column was added.
    logger.info("Using MTZ file: %s", amoptd['mtz'])
    resolution_max, resolution_min = content.max_min_resolution()
    amoptd['mtz_min_resolution'] = resolution_min
    amoptd['mtz_max_resolution'] = resolution_max
    logger.info(
        "Resolution limits of MTZ file are: {0: > 6.3F} and {1: > 6.3F}".format(resolution_min, resolution_max))

    return True