def msafilesgen(inpath_a3m):
    """
    Write a "jones" (.aln) copy of an a3m alignment and build its coverage figure.

    Parameters
    ----------
    inpath_a3m : str
        Multiple Sequence Alignment (.a3m) path.

    Returns
    -------
    parsemsa : msa
        Object returned by ``ckio.read`` for the a3m file.
    outpath_aln : str
        Path of the "jones" format file: the input path with its last
        extension replaced by ``.aln``.

    """
    # Strip the final extension once; both derived names start from this stem.
    stem = os.path.splitext(inpath_a3m)[0]
    outpath_aln = stem + ".aln"

    # A second extension of ".bio" (e.g. "name.bio.a3m") is carried over into
    # the figure file name; any other second extension is ignored.
    biotag = os.path.splitext(stem)[1]
    figure_tag = biotag if biotag == ".bio" else ""
    msacoveragefile = pdbid() + figure_tag + ".msa.coverage.png"
    msacoveragepath = os.path.join(output_dir(), msacoveragefile)

    # Parse the alignment, convert it, then build the coverage figure.
    parsemsa = ckio.read(inpath_a3m, 'a3m')
    ckio.write(outpath_aln, 'jones', parsemsa)
    ckplot.SequenceCoverageFigure(parsemsa, file_name=msacoveragepath)

    return parsemsa, outpath_aln
def write(infile, ftype, indata, ck=False):
    """Write ``indata`` through ConKit, or parse ``infile``, depending on ``ck``.

    NOTE(review): despite its name, this function only writes when ``ck`` is
    True and ``ftype`` is not 'xml'; otherwise it parses ``infile`` and
    ``indata`` is unused. It looks adapted from a parser -- confirm the
    intended behaviour with the callers.

    :param infile: Path to the file to be written (``ck=True``) or parsed.
    :type infile: str
    :param ftype: One of the types returned by ``_ftypelist()``
        (case-insensitive); the branches handled here are 'psicov',
        'ccmpred', 'fasta', 'pdb', 'a3m', 'jones', 'xml'.
    :type ftype: str
    :param indata: Data handed to ``ckio.write`` as ``hierarchy`` when
        ``ck`` is True.
    :param ck: Use the ConKit writer instead of the default path,
        defaults to False.
    :type ck: bool, optional
    :raises ValueError: If ``ftype`` is not a string or not a listed type.
    :return: Result of the selected reader/writer; a two-element tuple for
        'pdb'; None for 'psicov' and 'ccmpred', which are not implemented.
    """
    # Check the type before calling .lower(), so non-string input fails with
    # the intended ValueError instead of an AttributeError.
    if not isinstance(ftype, str) or ftype.lower() not in _ftypelist():
        logging.critical('Specified type not valid.')
        raise ValueError('Specified type not valid.')

    kind = ftype.lower()

    if ck is True and kind != 'xml':
        return ckio.write(infile, kind, hierarchy=indata)

    if kind in ('psicov', 'ccmpred'):
        # Not implemented: return None explicitly instead of falling through
        # to another reader or reaching `return` with an unbound variable.
        logging.warning("File type '%s' is not implemented.", kind)
        return None
    if kind == 'fasta':
        return cps.parseseqfile(infile)
    if kind == 'pdb':
        output1, output2 = cps.parsestrfile(infile)
        return output1, output2
    if kind in ('a3m', 'jones'):
        return ckio.read(infile, kind)
    if kind == 'xml':
        return ET.parse(infile)

    # A type accepted by _ftypelist() but not handled by any branch above.
    logging.warning("File type '%s' is not implemented.", kind)
    return None
def msafilesgen(inpath_a3m):
    """Convert an a3m alignment to "jones" format (.aln) and save its coverage plot.

    Both output files are written next to the input, sharing its path with
    the final extension replaced: ``<stem>.aln`` and ``<stem>.coverage.png``.

    :param inpath_a3m: Multiple Sequence Alignment (.a3m) path
    :type inpath_a3m: str
    :return: Value of the parsed alignment's ``meff`` attribute
        (presumably the number of effective sequences -- confirm against
        the ConKit documentation)
    """
    parsedmsa = ckio.read(inpath_a3m, 'a3m')

    # Append the suffixes by concatenation: os.path.join(stem, ".aln") would
    # insert a path separator and yield "<stem>/.aln".
    stem = os.path.splitext(inpath_a3m)[0]

    # Convert to 'jones' format
    outpath_aln = stem + ".aln"
    ckio.write(outpath_aln, 'jones', parsedmsa)

    # Plot coverage
    msacoveragepath = stem + ".coverage.png"
    fig = ckplot.SequenceCoverageFigure(parsedmsa)
    fig.savefig(msacoveragepath, overwrite=True)

    return parsedmsa.meff
def crop_fasta(fastaseq, outdir=output_tmpdir()):
    """
    Crop a fasta sequence to its biological limits.

    The limits come from ``biochain_ends('fasta')`` and are treated as
    1-based, inclusive residue numbers (first, last).

    Parameters
    ----------
    fastaseq : ConKit Sequence
        Source fasta sequence.
    outdir : str, optional
        Directory used to build the returned sequence path.
        The default is output_tmpdir(). NOTE: the default is evaluated once,
        when the function is defined, not on every call.

    Returns
    -------
    bioseq : ConKit Sequence
        Cropped (biological) sequence, or the input sequence when cropping
        is skipped.
    newseqpath : str
        Sequence path. A '.bio.fasta' file is written by this function; the
        '.fasta' path returned when cropping fails is not written here.
    isitbio : bool
        True if the returned sequence matches the biological limits,
        False if the limits were inconsistent with the input sequence.

    """
    # Obtain new chain ends (residue number)
    fastaends = biochain_ends('fasta')
    first, last = fastaends[0], fastaends[1]
    biolength = last - first + 1

    # Check that the sequence is consistent with the limits retrieved from
    # the database
    if biolength > fastaseq.seq_len:
        isitbio = False
        printout(
            'WARNING: The biological sequence limits include a section greater than the input sequence.',
            errorlog=True)
        printout(' Skipping cropping. Returning input values.',
                 errorlog=True, extraline=True)
        bioseq = fastaseq
        newseqpath = os.path.join(outdir, pdbid() + '.fasta')
    elif biolength == fastaseq.seq_len:
        isitbio = True
        printout(
            ' Biological and input sequences have the same length. Skipping cropping. Returning input values.',
            extraline=True)
        bioseq = fastaseq
        newseqpath = os.path.join(outdir, pdbid() + '.bio.fasta')
        ckio.write(newseqpath, "fasta", hierarchy=bioseq)
    elif last > fastaseq.seq_len:
        # Test the UPPER limit, as the warning states; a window ending past
        # the sequence would otherwise be silently truncated by the slice.
        isitbio = False
        printout(
            'WARNING: The sequence upper limit imported from the database is higher than the upper limit from the fasta file.',
            errorlog=True)
        printout(' Skipping cropping. Returning input values.',
                 errorlog=True, extraline=True)
        bioseq = fastaseq
        newseqpath = os.path.join(outdir, pdbid() + '.fasta')
    else:
        isitbio = True
        # Append new info to sequence
        newid = fastaseq.id + "|NO_CLONING_ARTIFACTS"
        # Residues first..last (1-based, inclusive) map to the slice
        # [first - 1:last]; ending at last - 1 would drop the final residue.
        newseq = fastaseq.seq[first - 1:last]
        bioseq = ckc.Sequence(newid, newseq)
        # Write new sequence to file
        newseqpath = os.path.join(outdir, pdbid() + '.bio.fasta')
        ckio.write(newseqpath, "fasta", hierarchy=bioseq)

    return bioseq, newseqpath, isitbio