Exemplo n.º 1
0
def write(infile, ftype, indata, ck=False):
    """Dispatch a file operation on ``infile`` according to ``ftype``.

    When ``ck`` is True and the type is not 'xml', ``indata`` is handed to
    ``ckio.write``. In every other case ``infile`` is parsed with the reader
    that matches ``ftype`` and the parsed content is returned.

    NOTE(review): despite the function name, all ``ck=False`` branches read
    ``infile``; confirm whether dedicated writers are still to be implemented.

    :param infile: Path to the file to operate on.
    :type infile: str
    :param ftype: One of 'psicov', 'ccmpred', 'fasta', 'pdb', 'a3m', 'jones', 'xml'.
    :type ftype: str
    :param indata: Data forwarded to ``ckio.write`` when ``ck`` is True.
    :param ck: Open alternative conkit version instead of default, defaults to False.
    :type ck: bool, optional
    :raises ValueError: If ``ftype`` is not a string or is not an accepted type.
    :raises NotImplementedError: If no default handler exists for ``ftype``
        (currently 'psicov' and 'ccmpred' with ``ck=False``).
    :return: Result of the selected handler (for 'pdb', a pair of values).

    """
    # The type check must come first: calling .lower() on a non-string would
    # raise AttributeError before the intended ValueError could be reported.
    if not isinstance(ftype, str) or ftype.lower() not in _ftypelist():
        logging.critical('Specified type not valid.')
        raise ValueError('Specified type not valid.')

    fmt = ftype.lower()

    if ck is True and fmt != 'xml':
        # NOTE(review): the keyword is spelled 'hyerarchy' here; verify it
        # against the signature of ckio.write (possibly 'hierarchy').
        return ckio.write(infile, fmt, hyerarchy=indata)

    if fmt == 'fasta':
        return cps.parseseqfile(infile)
    if fmt == 'pdb':
        output1, output2 = cps.parsestrfile(infile)
        return output1, output2
    # Membership test instead of "== 'a3m' or 'jones'", which was always true
    # and made the 'xml' branch below unreachable.
    if fmt in ('a3m', 'jones'):
        return ckio.read(infile, fmt)
    if fmt == 'xml':
        return ET.parse(infile)

    # 'psicov' and 'ccmpred' (and any accepted type without a branch above)
    # have no default handler; fail explicitly instead of returning an
    # unbound variable.
    raise NotImplementedError(
        "No default handler for type '" + fmt + "'; try ck=True.")
Exemplo n.º 2
0
def main():
    """Renumber structures against their sequences, then crop sequences and structures.

    Steps performed, in order:

    1. Parse the command line and validate the input paths.
    2. Parse the sequence file, the structure file(s) and the interval database.
    3. Optionally (UniProt threshold > 0) parse the UniProt sequences referenced
       by the intervals.
    4. Renumber every structure that matches a sequence identifier and write it.
    5. Crop each sequence and structure according to the intervals and write
       the cropped sequences, crop maps and two cropped structure files.

    :raises ValueError: If the UniProt threshold is outside [0, 100] or if no
        chains were imported from the sequence file.

    """
    # INITIALISE AND PARSE ARGUMENTS FROM COMMAND LINE
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = ccl.crops_logger(level="info")
    logger.info(ccl._welcome())

    inseq = check_path(args.input_seqpath[0], 'file')
    indb = check_path(args.input_database[0], 'file')
    instr = check_path(args.input_strpath[0])
    # uniprot_threshold holds two values: [0] the threshold and [1] either a
    # path or the literal 'server-only'.
    if args.uniprot_threshold is not None:
        if args.uniprot_threshold[1] != 'server-only':
            insprot = check_path(args.uniprot_threshold[1])
        else:
            insprot = 'server-only'
    else:
        insprot = None

    if args.uniprot_threshold is not None:
        minlen = float(args.uniprot_threshold[0])
        # The bounds 0.0 and 100.0 themselves pass this check, although the
        # message below states strict inequalities.
        if minlen < 0.0 or minlen > 100.0:
            logger.critical(
                'The UniProt threshold is a percentage and, therefore, it must fulfil 0 < threshold < 100.'
            )
            raise ValueError
    else:
        minlen = 0.0
    targetlbl = ctg.target_format(indb, terms=args.terminals, th=minlen)
    # infixlbl is indexed below with the keys "renumber", "croprenum" and "crop".
    infixlbl = ctg.infix_gen(indb, terms=args.terminals)

    # Default output location: the directory containing the sequence file.
    if args.outdir is None:
        outdir = check_path(os.path.dirname(inseq), 'dir')
    else:
        outdir = check_path(os.path.join(args.outdir[0], ''), 'dir')

    # PARSE INPUT FILES
    logger.info('Parsing sequence file ' + inseq)
    if args.preselect is not None:
        subset = set(args.preselect)
    else:
        subset = None
    seqset = cin.parseseqfile(seq_input=inseq, inset=subset)
    logger.info('Done')

    logger.info('Parsing structure file ' + instr)
    # fileset is not used in the rest of this function.
    strset, fileset = cin.parsestrfile(instr)
    logger.info('Done')

    logger.info('Parsing interval database file ' + indb)
    # The database import is only attempted when at least one sequence was parsed.
    if len(seqset) > 0:
        intervals = cin.import_db(indb, pdb_in=seqset)
    else:
        logger.critical('No chains were imported from sequence file.')
        raise ValueError
    logger.info('Done' + os.linesep)

    # UniProt sequences are only needed when a non-zero threshold was requested.
    if insprot is not None and minlen > 0.0:
        logger.info('Parsing uniprot sequence file: ' + insprot)
        # Collect, upper-cased, every key found under the 'uniprot' tag of the
        # intervals of each chain.
        uniprotset = set()
        for seqncid, seqnc in seqset.items():
            chains = seqnc.chainlist()
            for monomerid in chains:
                # 'monomer' is not used again inside this loop.
                monomer = seqnc.imer[seqnc.whatseq(monomerid)]
                if 'uniprot' in intervals[seqncid][monomerid].tags:
                    for key in intervals[seqncid][monomerid].tags['uniprot']:
                        if key.upper() not in uniprotset:
                            uniprotset.add(key.upper())

        upserver = True if insprot == 'server-only' else False
        # uniprotset is rebound here: from the set of identifiers to the parsed
        # result, which is indexed by identifier further below.
        uniprotset = cin.parseseqfile(seq_input=insprot,
                                      inset=uniprotset,
                                      use_UPserver=upserver)
        logger.info('Done' + os.linesep)

    # MAIN OPERATION / PRINT OUT RESULTS WITHIN
    # gseqset: sequence returned by renumber_pdb, keyed by sequence id.
    # strset2: structure later passed to crop_pdb, keyed by sequence id.
    gseqset = {}
    strset2 = {}
    logger.info('Renumbering structure(s)...')
    for key, structure in strset.items():
        found = False
        for seqname in seqset:
            # A sequence matches when its id is contained in the structure key,
            # or when there is exactly one sequence and one structure.
            if ((seqname in key) or (len(seqset) == 1 and len(strset) == 1)):
                finalid = seqname
                newstructure, gseqset[seqname] = cop.renumber_pdb(
                    seqset[seqname], structure, seqback=True)
                fout = finalid + infixlbl["renumber"] + os.path.splitext(
                    instr)[1]
                outstr = outpathgen(outdir,
                                    subdir=finalid,
                                    filename=fout,
                                    mksubdir=True)
                newstructure.write_minimal_pdb(outstr)
                # NOTE(review): 'structure', not 'newstructure', is stored for
                # the cropping stage - confirm whether renumber_pdb modifies
                # 'structure' in place or whether this is intentional.
                strset2[finalid] = structure
                found = True
        if found is False:
            logger.warning("Identifier '" + key +
                           "' not found in sequence input.")

    logger.info('Done' + os.linesep)
    logger.info('Cropping renumbered structure(s)...')
    # This value of outseq is reassigned inside the loop below before it is used.
    outseq = os.path.join(
        outdir,
        os.path.splitext(os.path.basename(inseq))[0] + infixlbl["croprenum"] +
        os.path.splitext(os.path.basename(inseq))[1])
    for key, S in gseqset.items():
        # newS receives the cropped monomers and is what crop_pdb is given.
        newS = S.deepcopy()
        if key in intervals:
            if insprot is not None and minlen > 0.0:
                newinterval = {}
            for key2, monomer in S.imer.items():
                # cropped_seq guarantees crop_seq is called at most once per
                # monomer, even when the monomer has several chains.
                cropped_seq = False
                for key3 in monomer.chains:
                    if key3 in intervals[key]:
                        if insprot is not None and minlen > 0.0:
                            # Start from a copy of the chain intervals with the
                            # sub-intervals emptied, then add back the parts
                            # overlapping UniProt entries that pass the threshold.
                            newinterval[key3] = intervals[key][key3].deepcopy()
                            newinterval[key3].tags[
                                'description'] += ' - Uniprot threshold'
                            newinterval[key3].subint = []
                            unilbl = ' uniprot chains included: '
                            # 'unicode' is just a local loop variable holding a
                            # key of the 'uniprot' tag.
                            # NOTE(review): the identifiers were upper-cased when
                            # collected above but are used as-is for the lookup
                            # in uniprotset - confirm the key case matches.
                            for unicode, uniintervals in intervals[key][
                                    key3].tags['uniprot'].items():
                                uniseq = uniprotset[unicode].imer['1']
                                # Percentage of the UniProt sequence length
                                # covered by the interval elements.
                                if 100 * uniintervals.n_elements(
                                ) / uniseq.length() >= minlen:
                                    newinterval[key3] = newinterval[
                                        key3].union(
                                            intervals[key][key3].intersection(
                                                uniintervals))
                                    unilbl += unicode + '|'
                            if cropped_seq is False:
                                monomer = cop.crop_seq(monomer,
                                                       newinterval[key3],
                                                       targetlbl + unilbl,
                                                       terms=args.terminals)
                                cropped_seq = True
                        else:
                            if cropped_seq is False:
                                monomer = cop.crop_seq(monomer,
                                                       intervals[key][key3],
                                                       targetlbl,
                                                       terms=args.terminals)
                                cropped_seq = True
                        # Store a copy of the (possibly cropped) monomer in newS.
                        if newS.imer[key2] != monomer:
                            newS.imer[key2] = monomer.deepcopy()
                    else:
                        logger.warning(
                            'Chain-name ' + key + '_' + str(key3) +
                            ' not found in database. Cropping not performed.')

                monomer.update_cropsheader()

                # File name: <id>[_<monomer>][<croprenum infix>]<sequence extension>
                hf = '_' + key2 if args.individual is True else ''
                ifx = infixlbl["croprenum"] if cropped_seq is True else ''
                fout = (key + hf + ifx +
                        os.path.splitext(os.path.basename(inseq))[1])
                outseq = outpathgen(outdir,
                                    subdir=key,
                                    filename=fout,
                                    mksubdir=True)
                monomer.dump(outseq)
                # A crop map file is written only when the monomer carries one.
                if monomer.cropmap is not None:
                    fout = key + hf + infixlbl["croprenum"] + '.cropmap'
                    outmap = outpathgen(outdir, subdir=key, filename=fout)
                    monomer.dumpmap(outmap)

            # Two cropped structures are written: one with original_id=True
            # ("crop" infix) and one with original_id=False ("croprenum" infix).
            cropped_str = cop.crop_pdb(strset2[key], newS, original_id=True)
            fout = key + infixlbl["crop"] + os.path.splitext(instr)[1]
            outstr = outpathgen(outdir,
                                subdir=key,
                                filename=fout,
                                mksubdir=True)
            cropped_str.write_minimal_pdb(outstr)

            cropped_str2 = cop.crop_pdb(strset2[key], newS, original_id=False)

            fout = key + infixlbl["croprenum"] + os.path.splitext(instr)[1]
            outstr = outpathgen(outdir,
                                subdir=key,
                                filename=fout,
                                mksubdir=True)
            cropped_str2.write_minimal_pdb(outstr)
        else:
            # No intervals for this identifier: dump the uncropped sequences.
            logger.warning('PDB-ID ' + key.upper() +
                           ' not found in database. Cropping not performed.')
            for key2, monomer in newS.imer.items():
                hf = '_' + key2 if args.individual is True else ''
                fout = key + hf + os.path.splitext(os.path.basename(inseq))[1]
                outseq = outpathgen(outdir,
                                    subdir=key,
                                    filename=fout,
                                    mksubdir=True)
                monomer.dump(outseq)

    # FINISH
    logger.info('Done' + os.linesep)

    return
Exemplo n.º 3
0
def main():
    """Renumber structure file(s) so that residue positions follow the sequence file.

    Every parsed structure is paired with each sequence whose identifier is
    contained in the structure identifier (or with the only sequence when
    there is exactly one sequence and one structure). The renumbered
    structure is written under the output directory.

    Non-polymer elements are numbered starting right after the final (TER) residue.
    IMPORTANT: If the input sequence and the input structure files are not from the
    same source (e.g. RCSB PDB) a source conflict might occur making the
    renumbering operation unsuccessful even if the program does not crash.

    """
    global logger

    # COMMAND LINE AND LOGGER
    args = create_argument_parser().parse_args()

    logger = ccl.crops_logger(level="info")
    logger.info(ccl._welcome())

    inseq = check_path(args.input_seqpath[0], 'file')
    instr = check_path(args.input_strpath[0])

    # Without an explicit output directory, write next to the sequence file.
    if args.outdir is not None:
        outdir = check_path(os.path.join(args.outdir[0], ''), 'dir')
    else:
        outdir = check_path(os.path.dirname(inseq), 'dir')
    infixlbl = ".crops.seq"

    # INPUT FILES
    logger.info('Parsing sequence file ' + inseq)
    subset = None if args.preselect is None else set(args.preselect)
    seqset = cin.parseseqfile(seq_input=inseq, inset=subset)
    logger.info('Done')

    logger.info('Parsing structure file ' + instr)
    strset, _fileset = cin.parsestrfile(instr)
    logger.info('Done')

    # RENUMBERING; RESULTS ARE WRITTEN AS THEY ARE PRODUCED
    logger.info('Renumbering structure(s)...')
    for pdbid, structure in strset.items():
        matched = False
        for seqname in seqset:
            if not ((seqname in pdbid) or
                    (len(seqset) == 1 and len(strset) == 1)):
                continue

            try:
                renumbered = cop.renumber_pdb(seqset[seqname], structure)
            except (AttributeError, IndexError) as err:
                logger.warning(
                    'Something has gone wrong during renumbering:\n{}'.format(
                        err))
                if not args.force_alignment:
                    logger.critical(
                        'Unable to renumber the structure, exiting now. '
                        'Try again with -f option to force the alignment.')
                    return
                # Fall back to the alignment-based renumbering.
                logger.info('Attempting Needleman-Wunsch...')
                renumbered = cop.renumber_pdb_needleman(
                    seqset[seqname], structure)

            outname = seqname + infixlbl + os.path.splitext(instr)[1]
            outstr = outpathgen(outdir,
                                subdir=seqname,
                                filename=outname,
                                mksubdir=True)
            renumbered.write_minimal_pdb(outstr)
            matched = True

        if not matched:
            logger.warning("Identifier '" + pdbid +
                           "' not found in sequence input.")

    # FINISH
    logger.info('Done' + os.linesep)
def main():
    """Run the PISACov command-line pipeline for one sequence/structure pair.

    Steps performed, in order:

    1. Parse the command line and the configuration returned by
       ``pco._initialise_inputs``.
    2. Parse the sequence and structure files and determine a single ``pdbid``.
    3. Unless ``--skip_conpred`` is given: crop or renumber with CROPS, build
       the MSAs with HHblits, run DeepMetaPSICOV and run PISA.

    :raises Exception: If both the sequence set and the structure set contain
        more than one identifier.
    :raises ValueError: If no sequence identifier can be matched to the
        structure identifier.

    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()

    invals['INSEQ'] = None
    invals['INSTR'] = None
    invals['ALTDB'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None
    invals['UPTHRESHOLD'] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # The run goes on with UPTHRESHOLD left as None.
            # NOTE(review): confirm that continuing after a 'critical' log
            # entry is intended here.
            logger.critical('Uniprot threshold given not valid.')
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    if args.skip_conpred is True:
        skipexec = True
        if (args.hhblits_arguments is not None
                or args.uniprot_threshold is not None):
            logger.info(
                'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
            )
    else:
        skipexec = False
    cropping = args.remove_insertions

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # OUTCSVPATH holds two entries: [0] the 'cropped' CSV, [1] the 'full' CSV.
    invals['OUTCSVPATH'] = []
    if args.collection_file is None:
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "cropped" +
                              os.extsep + "pisacov" + os.extsep + "csv"))))
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "full" + os.extsep +
                              "pisacov" + os.extsep + "csv"))))
    else:
        if cropping is True:
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))
            # os.path.splitext keeps the leading dot in the extension, so no
            # extra separator is added before it (avoids 'name.full..csv').
            csvroot, csvext = os.path.splitext(args.collection_file[0])
            invals['OUTCSVPATH'].append(
                ppaths.check_path(csvroot + os.extsep + 'full' + csvext))
        else:
            invals['OUTCSVPATH'].append(None)
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])

    logger.info('Parsing structure file...')
    strs, filestrs = cps.parsestrfile(invals['INSTR'])

    # Determine the single identifier to work with.
    pdbid = None
    if len(seqs) == 1:
        pdbid = next(iter(seqs))
    elif len(strs) == 1:
        # Several sequences but one structure: keep the sequence identifier
        # that equals, or is contained in, the structure identifier
        # (case-insensitive comparison, upper-cased result).
        for key in strs:
            for key2 in seqs:
                if key.upper() == key2.upper():
                    pdbid = key.upper()
                elif key2.upper() in key.upper():
                    pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')

    if pdbid is None:
        # Fail with a clear message instead of an unbound-variable error.
        logger.critical('No sequence identifier matches the structure set.')
        raise ValueError('No sequence identifier matches the structure set.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))

    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, ' +
                        'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'], invals['INSTR'],
                         invals['SIFTS_PATH'], invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure ' +
                        'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], invals['INSTR'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))

        ppaths.mdir(outpdbdir)
        if cropping is False:
            psc.splitseqs(invals['INSEQ'], outpdbdir)
        copyfile(invals['INSTR'], instrc)

    # Per-monomer sequence and MSA file paths; sequences are only written
    # when the external programs are going to run.
    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + '.fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + '.msa.aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                (fprefix + os.extsep + 'msa' + os.extsep + 'aln'))
            seq.set_cropmaps(amap, cropmain=True)

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')
    # NOTE(review): this path has no pdbid sub-directory, unlike fcropstr
    # below - confirm it matches where the renumbered structure is written.
    fstr = os.path.join(
        invals['OUTROOT'],
        (pdbid + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb'))
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids' + os.extsep +
             'to_uniprot' + os.path.splitext(invals['INSTR'])[1]))
    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'], hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                # With no crops, the cropped MSA also serves as the full MSA.
                if iseq.ncrops() == 0:
                    iseq.msa = iseq.cropmsa
                    logger.info('    Cropped sequence ' + iseq.oligomer_id +
                                '_' + iseq.name +
                                ' is identical to original sequence.')
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            # Copy the sequence file into the DMP directory before running.
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        sfile = fcropstr if cropping is True else fstr
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        psp.runpisa(sfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return