예제 #1
0
def msafilesgen(inpath_a3m):
    """
    Write a "jones" format (.aln) copy of an a3m alignment and plot its coverage.

    Parameters
    ----------
    inpath_a3m : str
        Path to the Multiple Sequence Alignment (.a3m) file.

    Returns
    -------
    parsemsa : msa
        The alignment as parsed by ConKit.
    outpath_aln : str
        Path of the jones-format (.aln) file that was written.

    """
    stem = os.path.splitext(inpath_a3m)[0]
    outpath_aln = os.path.join(stem + ".aln")

    # A ".bio" second extension (e.g. "name.bio.a3m") is carried over into
    # the coverage plot file name; any other second extension is ignored.
    biotag = os.path.splitext(stem)[1]
    tag = biotag if biotag == ".bio" else ""
    msacoveragefile = pdbid() + tag + ".msa.coverage.png"
    msacoveragepath = os.path.join(output_dir(), msacoveragefile)

    # Parse the a3m alignment, then re-emit it in jones format.
    parsemsa = ckio.read(inpath_a3m, 'a3m')
    ckio.write(outpath_aln, 'jones', parsemsa)

    # The figure is given its target file name at construction time.
    ckplot.SequenceCoverageFigure(parsemsa, file_name=msacoveragepath)

    return parsemsa, outpath_aln
예제 #2
0
def write(infile, ftype, indata, ck=False):
    """Dispatch a file operation on ``infile`` according to ``ftype``.

    With ``ck=True`` and any type other than 'xml', ``indata`` is written to
    ``infile`` through ``ckio.write``. In every other case ``infile`` is
    parsed with the reader associated with ``ftype`` and ``indata`` is unused.

    NOTE(review): despite the function name, the ``ck=False`` path only reads
    ``infile``; confirm whether native writers are still to be implemented.

    :param infile: Path to the file to operate on.
    :type infile: str
    :param ftype: One of 'psicov', 'ccmpred', 'fasta', 'pdb', 'a3m', 'jones', 'xml'
        (case-insensitive; validated against ``_ftypelist()``).
    :type ftype: str
    :param indata: Data handed to ``ckio.write`` as ``hierarchy`` when ``ck`` is True.
    :param ck: Use the ConKit writer instead of the default handlers, defaults to False.
    :type ck: bool, optional
    :raises ValueError: If ``ftype`` is not a string or not a supported type.
    :return: Whatever the selected handler returns; a 2-tuple for 'pdb';
        None for 'psicov' and 'ccmpred', which have no handler yet.

    """
    # Check the type before calling .lower(), so that a non-string ftype is
    # reported as ValueError rather than failing with AttributeError.
    if not isinstance(ftype, str) or ftype.lower() not in _ftypelist():
        logging.critical('Specified type not valid.')
        raise ValueError('Specified type not valid.')

    kind = ftype.lower()

    if ck is True and kind != 'xml':
        return ckio.write(infile, kind, hierarchy=indata)

    if kind == 'fasta':
        return cps.parseseqfile(infile)
    if kind == 'pdb':
        output1, output2 = cps.parsestrfile(infile)
        return output1, output2
    if kind in ('a3m', 'jones'):
        return ckio.read(infile, kind)
    if kind == 'xml':
        return ET.parse(infile)

    # 'psicov' and 'ccmpred' are accepted but not implemented yet.
    return None
예제 #3
0
def msafilesgen(inpath_a3m):
    """Convert a3m format alignment to "jones" format (.aln) and save an msa coverage graph.

    Both output files are written next to the input file, reusing its path
    with the final extension replaced: ``<stem>.aln`` and
    ``<stem>.coverage.png``.

    :param inpath_a3m: Multiple Sequence Alignment (.a3m) path
    :type inpath_a3m: str
    :return: The ``meff`` attribute of the ConKit parsed MSA
        (presumably the number of effective sequences -- confirm against ConKit).

    """
    parsedmsa = ckio.read(inpath_a3m, 'a3m')

    # Extensions are appended to the stem; os.path.join would instead create
    # a path to a hidden file inside a directory named after the stem.
    stem = os.path.splitext(inpath_a3m)[0]

    # Convert to 'jones' format
    outpath_aln = stem + ".aln"
    ckio.write(outpath_aln, 'jones', parsedmsa)

    # Plot Coverage
    msacoveragepath = stem + ".coverage.png"
    fig = ckplot.SequenceCoverageFigure(parsedmsa)
    fig.savefig(msacoveragepath, overwrite=True)

    return parsedmsa.meff
예제 #4
0
def crop_fasta(fastaseq, outdir=output_tmpdir()):
    """
    FASTA SEQUENCE CROPPING

    Crop a sequence to the chain limits returned by ``biochain_ends('fasta')``.
    The limits are treated as 1-based, inclusive residue numbers, so the
    biological length is ``last - first + 1``.

    Parameters
    ----------
    fastaseq : ConKit Sequence
        Source fasta sequence; its ``id``, ``seq`` and ``seq_len`` are read.
    outdir : str, optional
        Directory where the biological sequence will be printed out.
        The default is output_tmpdir(), evaluated once when the function is defined.

    Returns
    -------
    bioseq : ConKit Sequence
        Cropped (biological) sequence, or the input sequence when cropping
        was skipped.
    newseqpath : str
        Sequence path. A '.bio.fasta' file is written when the sequence is
        biological; when cropping is skipped with a warning, a '.fasta' path
        is returned but nothing is written by this function.
    isitbio : bool
        True if the returned sequence matches the biological limits.

    """
    # Obtain new chain ends (residue number)
    fastaends = biochain_ends('fasta')
    first = fastaends[0]
    last = fastaends[1]
    biolen = last - first + 1

    # Check that the sequence is consistent with the limits retrieved from the database
    if biolen > fastaseq.seq_len:
        printout(
            'WARNING: The biological sequence limits include a section greater than the input sequence.',
            errorlog=True)
        printout('         Skipping cropping. Returning input values.',
                 errorlog=True,
                 extraline=True)
        newseqpath = os.path.join(outdir, pdbid() + '.fasta')
        return fastaseq, newseqpath, False

    if biolen == fastaseq.seq_len:
        printout(
            '         Biological and input sequences have the same length. Skipping cropping. Returning input values.',
            extraline=True)
        newseqpath = os.path.join(outdir, pdbid() + '.bio.fasta')
        ckio.write(newseqpath, "fasta", hierarchy=fastaseq)
        return fastaseq, newseqpath, True

    # The upper limit must lie inside the input sequence; otherwise the slice
    # below would silently return fewer residues than the limits describe.
    if last > fastaseq.seq_len:
        printout(
            'WARNING: The sequence upper limit imported from the database is higher than the upper limit from the fasta file.',
            errorlog=True)
        printout('         Skipping cropping. Returning input values.',
                 errorlog=True,
                 extraline=True)
        newseqpath = os.path.join(outdir, pdbid() + '.fasta')
        return fastaseq, newseqpath, False

    # Append new info to sequence
    newid = fastaseq.id + "|NO_CLONING_ARTIFACTS"

    # Create new sequence: 1-based inclusive [first, last] maps to the
    # 0-based half-open slice [first - 1, last).
    newseq = fastaseq.seq[first - 1:last]
    bioseq = ckc.Sequence(newid, newseq)

    # Write new sequence to file
    newseqpath = os.path.join(outdir, pdbid() + '.bio.fasta')
    ckio.write(newseqpath, "fasta", hierarchy=bioseq)

    return bioseq, newseqpath, True