def test_format_contigs_denovo():
    """Smoke-test ``FastA.format_contigs_denovo`` on the bundled test FASTA.

    The formatted output goes to a temporary file; the ``names``, ``lengths``
    and ``comments`` accessors are then read once each so that an error raised
    while computing them makes the test fail.
    """
    # test with a custom fasta
    fasta = FastA(sequana_data("test_fasta.fasta"))
    with TempFile(suffix=".fasta") as tmp:
        fasta.format_contigs_denovo(tmp.name)

    for accessor in ("names", "lengths", "comments"):
        getattr(fasta, accessor)
def __init__(self, directory=".", prefix=""):
    """Look up the expected result files and load the ones that are found.

    Each file is searched with ``self.get_file``. The value it returns is
    stored on the instance, and the matching reader (``FastQ``, ``FastA`` or
    ``pandas.read_csv``) is only invoked when that value is truthy.

    :param str directory: stored as ``self.directory``.
    :param str prefix: stored as ``self.prefix``.
    """
    self.prefix = prefix
    self.directory = directory
    self.sample_name = "undefined"

    # low quality isoforms
    lq_name = "all.polished_lq.fastq"
    self.lq_isoforms = self.get_file(lq_name)
    if self.lq_isoforms:
        logger.info("Reading {}".format(lq_name))
        self.lq_sequence = FastQ(self.lq_isoforms)

    # high quality isoforms
    hq_name = "all.polished_hq.fastq"
    self.hq_isoforms = self.get_file(hq_name)
    if self.hq_isoforms:
        logger.info("Reading {}".format(hq_name))
        self.hq_sequence = FastQ(self.hq_isoforms)

    # General info
    csv_name = "file.csv"
    self.csv = self.get_file(csv_name)
    if self.csv:
        logger.info("Reading {}".format(csv_name))
        self.data = pd.read_csv(self.csv)

    # CCS fasta sequence: ``self.ccs`` first holds the value returned by
    # get_file and is replaced by the parsed FastA object when it is truthy.
    ccs_name = "ccs.fasta"
    self.ccs = self.get_file(ccs_name, noprefix=True)
    if self.ccs:
        logger.info("Reading {}".format(ccs_name))
        self.ccs = FastA(self.ccs)
def find_motif_fasta(self, filename, motif, window=200, local_threshold=None, global_threshold=None):
    """Scan every record of a FASTA file for *motif* and tabulate the hits.

    Each sequence is passed to ``self.find_motif_from_sequence``; a record is
    reported when the second value returned by that method is greater than or
    equal to the global threshold.

    :param str filename: FASTA file to scan.
    :param motif: forwarded to ``find_motif_from_sequence``.
    :param int window: forwarded to ``find_motif_from_sequence``.
    :param local_threshold: forwarded to ``find_motif_from_sequence``.
    :param global_threshold: minimal score for a record to be reported.
        When None (default), ``self.global_threshold`` is used.
    :return: a pandas DataFrame with the columns ``query_name``, ``hit``,
        ``length``, ``start`` and ``end`` (one row per reported record;
        ``start`` is always 0 and ``end`` the sequence length).
    """
    from easydev import Progress
    from sequana import FastA

    # The argument used to be silently ignored in favour of the attribute;
    # honour it and keep the attribute as the default.
    if global_threshold is None:
        global_threshold = self.global_threshold

    data = FastA(filename)
    pb = Progress(len(data))

    hits = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": [],
    }

    for i, item in enumerate(data):
        _, score = self.find_motif_from_sequence(
            item.sequence, motif, window=window, local_threshold=local_threshold
        )
        if score >= global_threshold:
            seq_length = len(item.sequence)
            hits["query_name"].append(item.name)
            hits["start"].append(0)
            hits["end"].append(seq_length)
            hits["length"].append(seq_length)
            hits["hit"].append(score)
        pb.animate(i + 1)

    return pd.DataFrame(hits)
def main(args=None):
    """Map FASTQ reads onto a reference with ``bwa mem`` and sort the BAM.

    Steps: parse the command line, print the theoretical depth of coverage,
    index the reference (``bwa index`` and ``samtools faidx``), then run the
    mapping piped into ``samtools``/``sambamba`` to produce
    ``<reference>.sorted.bam``.

    :param list args: command-line arguments including the program name;
        defaults to a copy of ``sys.argv``.
    :raises ValueError: if ``--file1`` is not provided.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
        # Only reached if the parser did not exit after showing the help;
        # there are no options to work with in that case.
        return
    options = user_options.parse_args(args[1:])

    # Validate first so that ``fastq`` is always bound below.
    if not options.file1:
        raise ValueError("--file1 must be used")

    reference = options.reference
    if options.file2:
        fastq = "%s %s" % (options.file1, options.file2)
    else:
        fastq = "%s" % (options.file1)

    from sequana import FastQ
    from sequana import FastA

    # Total number of sequenced bases over one or two FASTQ files.
    S = sum(len(read['sequence']) for read in FastQ(options.file1))
    if options.file2:
        S += sum(len(read['sequence']) for read in FastQ(options.file2))

    ref = FastA(options.reference)
    # NOTE(review): only the first reference sequence is used as the genome
    # length; for a multi-sequence reference this over-estimates the depth.
    coverage = float(S) / len(ref.sequences[0])
    print('Theoretical Depth of Coverage : %s' % coverage)

    params = {"reference": reference, "fastq": fastq, "thread": options.thread}

    # indexing
    shellcmd("bwa index %(reference)s " % params)
    # This command used to be built and then overwritten without being run.
    shellcmd("samtools faidx %(reference)s " % params)

    # mapping
    # mark shorter split read as secondary; -M is not compulsary but recommended
    cmd = "bwa mem -M "
    if options.pacbio:
        cmd += "-x pacbio "
    cmd += r" -t %(thread)s -R @RG\\tID:1\\tSM:1\\tPL:illumina -T 30 %(reference)s %(fastq)s "

    # Samtools options:
    #   S:ignore input format
    #   h:include header
    #   b:bam output
    if options.sambamba is False:
        cmd += "| samtools view -Sbh | "
        # sorting BAM
        cmd += "samtools sort -@ %(thread)s -o %(reference)s.sorted.bam -"
    else:
        # FIXME use sambamba for the view as well
        # The template is formatted once, below; formatting this part here
        # and again afterwards breaks on values containing a '%'.
        cmd += "| samtools view -Sbu - | sambamba sort /dev/stdin -o %(reference)s.sorted.bam -t %(thread)s --tmpdir=./tmp "

    shellcmd(cmd % params)
def test_fasta_fwd_rev_to_columns():
    """Check the PCRFree adapter FASTA files and the column conversion.

    Verifies equality/inequality and length of the forward and reverse
    ``FastA`` objects, then calls ``adapters.fasta_fwd_rev_to_columns`` with
    and without a reverse file, and with and without an output filename.
    """
    fwd_path = sequana_data("adapters_PCRFree_fwd.fa")
    rev_path = sequana_data("adapters_PCRFree_rev.fa")
    fwd = FastA(fwd_path)
    rev = FastA(rev_path)

    assert fwd == fwd
    assert fwd != rev
    assert len(fwd) == 49
    assert len(rev) == 49

    # Output written to an explicit temporary file.
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path, tmp.name)
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, None, output_filename=tmp.name)

    # Same calls without an output filename (the temporary file is unused).
    with TempFile():
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path)
    with TempFile():
        adapters.fasta_fwd_rev_to_columns(fwd_path, None)
def test_fasta_fwd_rev_to_columns():
    """Check the NEXTFlex48 adapter FASTA files and the column conversion.

    Verifies equality/inequality and length of the forward, reverse and
    reverse-complement ``FastA`` objects, then calls
    ``adapters.fasta_fwd_rev_to_columns`` with and without a reverse file,
    and with and without an output filename.
    """
    fwd_path = sequana_data("NEXTFlex48_DNA_fwd.fa")
    rev_path = sequana_data("NEXTFlex48_DNA_rev.fa")
    revcomp_path = sequana_data("NEXTFlex48_DNA_revcomp.fa")
    fwd = FastA(fwd_path)
    rev = FastA(rev_path)
    revcomp = FastA(revcomp_path)

    assert fwd == fwd
    assert fwd != rev
    assert len(fwd) == 49
    assert len(rev) == 49
    assert len(revcomp) == 49

    # Output written to an explicit temporary file.
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path, tmp.name)
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, None, output_filename=tmp.name)

    # Same calls without an output filename (the temporary file is unused).
    with TempFile():
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path)
    with TempFile():
        adapters.fasta_fwd_rev_to_columns(fwd_path, None)
def add_locus_in_fasta(self, fasta, output_file):
    """Add locus of annotation file in description line of fasta file.

    If the FASTA file and the annotation file already use the same sequence
    identifiers, ``output_file`` is created as a symbolic link to ``fasta``.
    Otherwise a new FASTA file is written in which each header is
    ``><annotation id> <fasta name>``, assuming both files list the
    sequences in the same order. Sequences are wrapped at 80 characters.

    The process exits with status 1 when the two files do not contain the
    same number of contigs.

    :param str fasta: input fasta file where you want to add locus.
    :param str output_file: output file.

    FIXME: fasta is already known if provided in the init
    """
    fasta_record = FastA(fasta)
    ids_list = self._get_seq_ids()

    # check if both files have same number of contigs
    if len(fasta_record) != len(ids_list):
        print(
            "fasta and annotation files don't have the same number of "
            "contigs. Found {} and {}".format(len(fasta_record), len(ids_list))
        )
        sys.exit(1)

    # Create the output directory if needed. An empty dirname means the
    # current directory, for which there is nothing to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Fetch once instead of on every loop iteration below.
    names = fasta_record.names

    if sorted(names) == sorted(ids_list):
        logger.info("Files have same sequence id.")
        # islink also catches a dangling symlink, which isfile reports as
        # missing and which would make os.symlink fail.
        if os.path.islink(output_file) or os.path.isfile(output_file):
            os.remove(output_file)
        os.symlink(os.path.realpath(fasta), output_file)
        return

    logger.info(
        "fasta and GFF seem to have different IDs. Creating a "
        "new coherent fasta file assuming the chromosome names appear "
        "in the same order in the fasta and gff"
    )

    # Read the sequences before touching the output, and drop a stale
    # symlink so that the write below cannot go through it and overwrite
    # the link target (possibly the input fasta itself).
    sequences = fasta_record.sequences
    if os.path.islink(output_file):
        os.remove(output_file)

    with open(output_file, "w") as fp:
        # write fasta with seqid of annotation file
        for seq_id, name, seq in zip(ids_list, names, sequences):
            header = ">{0} {1}\n".format(seq_id, name)
            wrapped = "\n".join(seq[i:i + 80] for i in range(0, len(seq), 80))
            fp.write(header + wrapped + "\n")
def bar_plot_contigs_length(self):
    """Bar plot of the sorted contig lengths against the reference lengths.

    The contigs of ``self.fasta`` are drawn as unit-width bars at positions
    ``0..N-1``. The sequences of ``self.reference`` are drawn as wider bars
    spread along the same axis, one bar every ``ceil(N / Nref)`` positions.
    The current figure is cleared first and a legend is added.
    """
    # show length of N contigs as compare to length of the reference
    fref = FastA(self.reference)
    Nref = len(fref.sequences)
    N = len(self.fasta)

    ref_lengths = sorted(fref.lengths)

    # One x position per reference length. Using range(0, N, step) could
    # yield fewer positions than bars (e.g. N=9, Nref=4) or fail on a zero
    # step (N=0); the positions are identical whenever that form worked.
    step = int(pylab.ceil(N / Nref)) if Nref else 1
    ref_positions = [k * step for k in range(len(ref_lengths))]

    pylab.clf()
    pylab.bar(ref_positions, ref_lengths, width=Nref / 1.1,
              label="Plasmodium chromosomes")
    pylab.bar(range(0, N), sorted(self.fasta.lengths), width=1,
              label="canu {} contigs".format(N))
    pylab.legend()
def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
    """Store the inputs and open the FASTA (and optionally the BAM) file.

    A BAM file can be produced beforehand with, for instance::

        minimap2 -x map-pb reference filename -a > temp.sam
        bioconvert sam2bam temp.sam temp.bam

    :param filename: FASTA file, opened with ``FastA``.
    :param reference: stored as ``self.reference``.
    :param bamfile: if truthy, opened with ``BAM`` and stored as
        ``self.bam``; otherwise ``self.bam`` is None.
    :param mode: stored as ``self.mode``.
    """
    self.filename = filename
    self.fasta = FastA(filename)
    self.mode = mode
    self._df = None
    self.bam = BAM(bamfile) if bamfile else None
    self.reference = reference
def add_locus_in_fasta(self, fasta, output_file):
    """Add locus of annotation file in description line of fasta file.

    If the FASTA file and the annotation file already use the same sequence
    identifiers, ``output_file`` is created as a symbolic link to ``fasta``.
    Otherwise a new FASTA file is written in which each header is
    ``><annotation id> <fasta name>``, pairing the sequences in file order.
    Sequences are wrapped at 80 characters.

    The process exits with status 1 when the two files do not contain the
    same number of contigs.

    :param str fasta: input fasta file where you want to add locus.
    :param str output_file: output file.
    """
    fasta_record = FastA(fasta)
    ids_list = self._get_seq_ids()

    # check if both files have same number of contigs
    if len(fasta_record) != len(ids_list):
        print("fasta and annotation files don't have the same number of "
              "contigs.")
        sys.exit(1)

    # Create the output directory if needed. An empty dirname means the
    # current directory, for which there is nothing to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Fetch once instead of on every loop iteration below.
    names = fasta_record.names

    # Compare every identifier: checking only the first one fails on empty
    # input and wrongly reports a match when later identifiers differ.
    if sorted(names) == sorted(ids_list):
        print("Files have same sequence id.")
        # islink also catches a dangling symlink, which isfile reports as
        # missing and which would make os.symlink fail.
        if os.path.islink(output_file) or os.path.isfile(output_file):
            os.remove(output_file)
        os.symlink(os.path.realpath(fasta), output_file)
        return

    # Read the sequences before touching the output, and drop a stale
    # symlink so that the write below cannot go through it and overwrite
    # the link target (possibly the input fasta itself).
    sequences = fasta_record.sequences
    if os.path.islink(output_file):
        os.remove(output_file)

    with open(output_file, "w") as fp:
        # write fasta with seqid of annotation file
        for seq_id, name, seq in zip(ids_list, names, sequences):
            header = ">{0} {1}\n".format(seq_id, name)
            wrapped = "\n".join(seq[i:i + 80] for i in range(0, len(seq), 80))
            fp.write(header + wrapped + "\n")
def __init__(self, directory=".", prefix="job-*"):
    """Look up the expected result files and load the ones that are found.

    Each file is searched with ``self.get_file``. The value it returns is
    stored on the instance, and the matching reader (``FastQ``, ``FastA`` or
    ``pandas.read_csv``) is only invoked when that value is truthy.

    :param str directory: stored as ``self.directory``.
    :param str prefix: stored as ``self.prefix``.
    """
    self.prefix = prefix
    self.directory = directory

    # low quality isoforms
    lq_path = self.get_file("lq_isoforms.fastq")
    self.lq_isoforms = lq_path
    if lq_path:
        self.lq_sequence = FastQ(lq_path)

    # high quality isoforms
    hq_path = self.get_file("hq_isoforms.fastq")
    self.hq_isoforms = hq_path
    if hq_path:
        self.hq_sequence = FastQ(hq_path)

    # General info
    csv_path = self.get_file("-file.csv")
    self.csv = csv_path
    if csv_path:
        self.data = pd.read_csv(csv_path)

    # CCS fasta sequence: ``self.ccs`` holds the value returned by get_file,
    # replaced by the parsed FastA object when that value is truthy.
    ccs_path = self.get_file("ccs.fasta", noprefix=True)
    self.ccs = FastA(ccs_path) if ccs_path else ccs_path
def get_fasta_stats(filename, sample=1e16):
    """Return the result of ``FastA.get_stats`` for the given FASTA file.

    :param filename: FASTA file to open with ``sequana.FastA``.
    :param sample: accepted but not used by this function.
    :return: whatever ``FastA(filename).get_stats()`` returns.
    """
    from sequana import FastA

    return FastA(filename).get_stats()
def __init__(self, filename, shift=4):
    """Load the FASTA file into ``self.SIRV`` and set ``self.shift``.

    :param filename: FASTA file opened with ``sequana.FastA``.
    :param shift: currently not used, see the note below.
    """
    from sequana import FastA

    reference = FastA(filename)
    self.SIRV = reference
    # NOTE(review): the ``shift`` argument (default 4) is ignored and the
    # attribute is always 5 -- confirm which value is intended.
    self.shift = 5
def __init__(self, filename):
    """Remember *filename* and open it as a ``FastA`` object.

    :param filename: path stored as ``self.filename`` and parsed into
        ``self.fasta``.
    """
    self.filename = filename
    parsed = FastA(filename)
    self.fasta = parsed
def test_others():
    """Exercise the remaining ``FastA`` accessors and export methods.

    Checks the record counts on the bundled test file, the ``is_fasta``
    helper, random read selection with an int, a list and a set, the summary
    statistics, and the writers that take an output filename.
    """
    path = sequana_data("test_fasta.fasta")
    fasta = FastA(path)

    assert len(fasta) == 16
    for accessor in ("comments", "names", "sequences"):
        assert len(getattr(fasta, accessor)) == 16
    assert is_fasta(path) == True

    fasta.get_lengths_as_dict()

    # The selection may be a number of reads, a list or a set.
    with TempFile(suffix=".fasta") as tmp:
        for selection in (4, [1, 2, 3], {1, 2, 3}):
            fasta.select_random_reads(selection, output_filename=tmp.name)

    assert fasta.get_stats()['N'] == 16
    assert fasta.get_stats()['mean_length'] > 454

    with TempFile(suffix=".fasta") as tmp:
        fasta.reverse_and_save(tmp.name)
        fasta.to_fasta(tmp.name)
        fasta.to_igv_chrom_size(tmp.name)
def test_fasta_filtering():
    """Check ``FastA.filter`` with an exclusion list and with a keep list.

    Also calls ``to_fasta`` and ``save_ctg_to_fasta`` once on a temporary
    file. The filtered files are read back inside the ``with`` block, while
    the temporary file is still available.
    """
    path = sequana_data("test_fasta_filtering.fa")

    fasta = FastA(path)
    with TempFile(suffix=".fasta") as tmp:
        fasta.to_fasta(tmp.name)
        fasta.save_ctg_to_fasta("A", tmp.name)

    # Excluding A and B must leave C and D only.
    with TempFile(suffix=".fasta") as tmp:
        fasta.filter(tmp.name, names_to_exclude=["A", "B"])
        remaining = set(FastA(tmp.name).names)
        assert remaining == {"C", "D"}

    # Keeping A only, starting again from a fresh reader.
    fasta = FastA(path)
    with TempFile(suffix=".fasta") as tmp:
        fasta.filter(tmp.name, names_to_keep=["A"])
        remaining = set(FastA(tmp.name).names)
        assert remaining == {"A"}