Exemplo n.º 1
0
def test_format_contigs_denovo():
    """Smoke-test ``format_contigs_denovo`` and a few ``FastA`` attributes."""
    # Test with a custom fasta resolved through sequana_data.
    fasta = FastA(sequana_data("test_fasta.fasta"))

    # The reformatted contigs go to a temporary file.
    with TempFile(suffix='.fasta') as tmp:
        fasta.format_contigs_denovo(tmp.name)

    # Only access the attributes: the test passes if none of them raises.
    for attribute in ("names", "lengths", "comments"):
        getattr(fasta, attribute)
Exemplo n.º 2
0
    def __init__(self, directory=".", prefix=""):
        """Look up the expected result files and load the ones that are found.

        Every file is searched with ``self.get_file``; its content is read
        only when that call returns a truthy value, so ``lq_sequence``,
        ``hq_sequence`` and ``data`` are set only when the corresponding
        file was found.

        :param str directory: stored in ``self.directory``.
        :param str prefix: stored in ``self.prefix``.
        """
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # Low quality isoforms
        lq_name = "all.polished_lq.fastq"
        lq_path = self.lq_isoforms = self.get_file(lq_name)
        if lq_path:
            logger.info("Reading {}".format(lq_name))
            self.lq_sequence = FastQ(lq_path)

        # High quality isoforms
        hq_name = "all.polished_hq.fastq"
        hq_path = self.hq_isoforms = self.get_file(hq_name)
        if hq_path:
            logger.info("Reading {}".format(hq_name))
            self.hq_sequence = FastQ(hq_path)

        # General info
        csv_name = "file.csv"
        csv_path = self.csv = self.get_file(csv_name)
        if csv_path:
            logger.info("Reading {}".format(csv_name))
            self.data = pd.read_csv(csv_path)

        # CCS fasta sequence (looked up with noprefix=True)
        ccs_name = "ccs.fasta"
        ccs_path = self.ccs = self.get_file(ccs_name, noprefix=True)
        if ccs_path:
            logger.info("Reading {}".format(ccs_name))
            # The path held by self.ccs is replaced by the FastA object.
            self.ccs = FastA(ccs_path)
Exemplo n.º 3
0
 def find_motif_fasta(self, filename, motif, window=200,
         local_threshold=None, global_threshold=None):
     """Score every sequence of a FASTA file and keep those above threshold.

     Each record of *filename* is passed to
     :meth:`find_motif_from_sequence`; records whose score is greater than
     or equal to the global threshold are reported.

     :param str filename: FASTA file to scan.
     :param motif: forwarded to :meth:`find_motif_from_sequence`.
     :param window: forwarded to :meth:`find_motif_from_sequence`.
     :param local_threshold: forwarded to :meth:`find_motif_from_sequence`.
     :param global_threshold: minimum score required to report a record.
         When None (default), ``self.global_threshold`` is used.
     :return: a :class:`pandas.DataFrame` with the columns ``query_name``,
         ``hit``, ``length``, ``start`` and ``end`` (one row per reported
         record; ``start`` is always 0 and ``end`` the sequence length).
     """
     from sequana import FastA
     from easydev import Progress

     # Honour the caller's threshold; previously the argument was ignored
     # and the instance attribute was always used.
     if global_threshold is None:
         global_threshold = self.global_threshold

     data = FastA(filename)
     pb = Progress(len(data))
     df = {
         "query_name": [],
         "hit": [],
         "length": [],
         "start": [],
         "end": []
     }
     for i, item in enumerate(data):
         # Only the score (second returned value) is needed here.
         _, score = self.find_motif_from_sequence(
             item.sequence, motif,
             window=window, local_threshold=local_threshold)
         if score >= global_threshold:
             length = len(item.sequence)
             df['query_name'].append(item.name)
             df['start'].append(0)
             df['end'].append(length)
             df['length'].append(length)
             df['hit'].append(score)
         pb.animate(i + 1)
     return pd.DataFrame(df)
Exemplo n.º 4
0
def main(args=None):
    """Map FastQ reads onto a reference with ``bwa mem`` and sort the result.

    The reference is indexed (``bwa index`` and ``samtools faidx``), the
    reads are mapped and the sorted BAM is written to
    ``<reference>.sorted.bam``. The theoretical depth of coverage (total
    number of read bases divided by the total reference length) is printed
    beforehand.

    :param list args: command-line arguments, program name included.
        Defaults to a copy of ``sys.argv``.
    :raises ValueError: if ``--file1`` is not provided.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help.
    # NOTE(review): parsing "--help" presumably exits; the result is still
    # bound so that ``options`` is always defined below.
    if len(args) == 1:
        options = user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    reference = options.reference

    # --file1 is mandatory, --file2 is optional (paired data).
    if not options.file1:
        raise ValueError("--file1 must be used")
    fastq_files = [options.file1]
    if options.file2:
        fastq_files.append(options.file2)
    fastq = " ".join("%s" % name for name in fastq_files)

    from sequana import FastQ
    from sequana import FastA

    # Total number of sequenced bases over all input files.
    total_bases = 0
    for fastq_file in fastq_files:
        for read in FastQ(fastq_file):
            total_bases += len(read['sequence'])

    # Use the whole reference, not only its first sequence, so that the
    # depth is also meaningful for multi-sequence references.
    ref = FastA(options.reference)
    reference_length = sum(len(sequence) for sequence in ref.sequences)
    coverage = float(total_bases) / reference_length
    print('Theoretical Depth of Coverage : %s' % coverage)

    params = {"reference": reference, "fastq": fastq, "thread": options.thread}

    # indexing (the faidx command used to be built but never executed)
    shellcmd("bwa index %(reference)s " % params)
    shellcmd("samtools faidx %(reference)s " % params)

    # mapping
    cmd = "bwa mem -M "  # mark shorter split read as secondary; -M is not compulsory but recommended
    if options.pacbio:
        cmd += "-x pacbio "
    cmd += r" -t %(thread)s -R @RG\\tID:1\\tSM:1\\tPL:illumina -T 30 %(reference)s %(fastq)s  "

    # Samtools options:
    #   S:ignore input format
    #   h:include header
    #   b:bam output
    if options.sambamba is False:
        cmd += "| samtools view -Sbh | "
        # sorting BAM
        cmd += "samtools sort -@ %(thread)s -o %(reference)s.sorted.bam -"
    else:
        # FIXME use sambamba for the view as well
        cmd += "| samtools view -Sbu - | sambamba sort /dev/stdin -o %(reference)s.sorted.bam -t %(thread)s  --tmpdir=./tmp  "

    # The template is interpolated exactly once, whatever the branch.
    shellcmd(cmd % params)
Exemplo n.º 5
0
def test_fasta_fwd_rev_to_columns():
    """Exercise ``adapters.fasta_fwd_rev_to_columns`` on the PCRFree files."""
    fwd_path = sequana_data("adapters_PCRFree_fwd.fa")
    rev_path = sequana_data("adapters_PCRFree_rev.fa")
    fwd, rev = FastA(fwd_path), FastA(rev_path)

    # Equality operators and number of records.
    assert fwd == fwd
    assert fwd != rev
    for fasta in (fwd, rev):
        assert len(fasta) == 49

    # With an explicit output file, given positionally then by keyword.
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path, tmp.name)
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, None,
                                          output_filename=tmp.name)

    # Without any output file, with and without the reverse adapters.
    for reverse in (rev_path, None):
        with TempFile():
            adapters.fasta_fwd_rev_to_columns(fwd_path, reverse)
Exemplo n.º 6
0
def test_fasta_fwd_rev_to_columns():
    """Exercise ``adapters.fasta_fwd_rev_to_columns`` on NEXTFlex48 files."""
    fwd_path = sequana_data("NEXTFlex48_DNA_fwd.fa")
    rev_path = sequana_data("NEXTFlex48_DNA_rev.fa")
    revcomp_path = sequana_data("NEXTFlex48_DNA_revcomp.fa")
    fwd = FastA(fwd_path)
    rev = FastA(rev_path)
    revcomp = FastA(revcomp_path)

    # Equality operators and number of records.
    assert fwd == fwd
    assert fwd != rev
    for fasta in (fwd, rev, revcomp):
        assert len(fasta) == 49

    # With an explicit output file, given positionally then by keyword.
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, rev_path, tmp.name)
    with TempFile() as tmp:
        adapters.fasta_fwd_rev_to_columns(fwd_path, None,
                                          output_filename=tmp.name)

    # Without any output file, with and without the reverse adapters.
    for reverse in (rev_path, None):
        with TempFile():
            adapters.fasta_fwd_rev_to_columns(fwd_path, reverse)
Exemplo n.º 7
0
    def add_locus_in_fasta(self, fasta, output_file):
        """Create a FASTA file whose sequence IDs match the annotation file.

        If the FASTA file and the annotation already hold the same set of
        sequence IDs, *output_file* is just a symbolic link to *fasta*.
        Otherwise a new FASTA file is written with header lines of the form
        ``>annotation_id fasta_name``, assuming the sequences appear in the
        same order in both files.

        The process exits with status 1 when the two files do not contain
        the same number of sequences.

        :param str fasta: input fasta file where you want to add locus.
        :param str output_file: output file.

        FIXME: fasta is already known if provided in the init
        """
        fasta_record = FastA(fasta)
        ids_list = self._get_seq_ids()
        # check if both files have same number of contigs
        if len(fasta_record) != len(ids_list):
            print("fasta and annotation files don't have the same number of "
                  "contigs. Found {} and {}".format(len(fasta_record),
                                                    len(ids_list)))
            sys.exit(1)

        # Create the output directory if needed. dirname() is empty when the
        # output lives in the current directory: nothing to create then.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        if sorted(fasta_record.names) == sorted(ids_list):
            logger.info("Files have same sequence id.")
            source = os.path.realpath(fasta)
            # Output already resolves to the input (same file or an existing
            # link to it): removing it would destroy the data.
            if os.path.realpath(output_file) == source:
                return
            # islink() also catches dangling links, which isfile() misses
            # and which would make os.symlink fail.
            if os.path.isfile(output_file) or os.path.islink(output_file):
                os.remove(output_file)
            os.symlink(source, output_file)
            return

        logger.info(
            "fasta and GFF seem to have different IDs. Creating a "
            "new coherent fasta file assuming the chromsome names appear "
            "in the same order in the fasta and gff")

        with open(output_file, "w") as fp:
            # write fasta with seqid of annotation file
            records = zip(ids_list, fasta_record.names,
                          fasta_record.sequences)
            for seq_id, name, seq in records:
                fp.write(">{0} {1}\n".format(seq_id, name))
                # Sequence wrapped at 80 characters per line.
                lines = [seq[i:i + 80] for i in range(0, len(seq), 80)]
                fp.write("\n".join(lines) + "\n")
Exemplo n.º 8
0
 def bar_plot_contigs_length(self):
     """Bar plot of the contig lengths against the reference lengths.

     Both sets of lengths are sorted in increasing order. The contigs are
     drawn at x = 0..N-1 and the reference sequences every
     ``ceil(N / Nref)`` units, with one bar per reference sequence.
     """
     # show length of N contigs as compare to length of the reference
     fref = FastA(self.reference)
     Nref = len(fref.sequences)
     N = len(self.fasta)

     ref_lengths = sorted(fref.lengths)
     # One x position per reference length. range(0, N, step) does not
     # always yield Nref values (e.g. N=10, Nref=6 gives 5), in which case
     # positions and heights no longer have the same size.
     step = int(pylab.ceil(N / Nref)) if Nref else 1
     ref_positions = [index * step for index in range(len(ref_lengths))]

     pylab.clf()
     pylab.bar(ref_positions,
               ref_lengths,
               width=Nref / 1.1,
               label="Plasmodium chromosomes")
     pylab.bar(range(0, N),
               sorted(self.fasta.lengths),
               width=1,
               label="canu {} contigs".format(N))
     pylab.legend()
Exemplo n.º 9
0
    def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
        """Store the arguments and load the FASTA (and optional BAM) file.

        :param filename: kept in ``self.filename`` and read with ``FastA``.
        :param reference: kept as is in ``self.reference``.
        :param bamfile: if truthy, opened with ``BAM`` and kept in
            ``self.bam``; otherwise ``self.bam`` is None.
        :param mode: kept in ``self.mode``.

        Commands noted by the original author (presumably to create the
        BAM file -- to be confirmed)::

            minimap2 -x map-pb reference filename -a > temp.sam
            bioconvert sam2bam temp.sam temp.bam

        """
        self.filename = filename
        self.fasta = FastA(filename)
        self.mode = mode
        self._df = None
        self.bam = BAM(bamfile) if bamfile else None
        self.reference = reference
Exemplo n.º 10
0
    def add_locus_in_fasta(self, fasta, output_file):
        """Create a FASTA file whose sequence IDs match the annotation file.

        If the FASTA file and the annotation already hold the same set of
        sequence IDs, *output_file* is just a symbolic link to *fasta*.
        Otherwise a new FASTA file is written with header lines of the form
        ``>annotation_id fasta_name``, pairing the records by position.

        The process exits with status 1 when the two files do not contain
        the same number of sequences.

        :param str fasta: input fasta file where you want to add locus.
        :param str output_file: output file.
        """
        fasta_record = FastA(fasta)
        ids_list = self._get_seq_ids()

        # check if both files have same number of contigs
        if len(fasta_record) != len(ids_list):
            print("fasta and annotation files don't have the same number of "
                  "contigs.")
            sys.exit(1)

        # Create the output directory if needed. dirname() is empty when the
        # output lives in the current directory: nothing to create then.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Compare every ID, not only the first one: this also works with
        # empty files and does not link a FASTA whose later IDs differ.
        if sorted(fasta_record.names) == sorted(ids_list):
            print("Files have same sequence id.")
            source = os.path.realpath(fasta)
            # Output already resolves to the input (same file or an existing
            # link to it): removing it would destroy the data.
            if os.path.realpath(output_file) == source:
                return
            # islink() also catches dangling links, which isfile() misses
            # and which would make os.symlink fail.
            if os.path.isfile(output_file) or os.path.islink(output_file):
                os.remove(output_file)
            os.symlink(source, output_file)
            return

        with open(output_file, "w") as fp:
            # write fasta with seqid of annotation file
            records = zip(ids_list, fasta_record.names,
                          fasta_record.sequences)
            for seq_id, name, seq in records:
                fp.write(">{0} {1}\n".format(seq_id, name))
                # Sequence wrapped at 80 characters per line.
                lines = [seq[i:i + 80] for i in range(0, len(seq), 80)]
                fp.write("\n".join(lines) + "\n")
Exemplo n.º 11
0
    def __init__(self, directory=".", prefix="job-*"):
        """Look up the expected result files and load the ones that are found.

        Every file is searched with ``self.get_file``; its content is read
        only when that call returns a truthy value, so ``lq_sequence``,
        ``hq_sequence`` and ``data`` are set only when the corresponding
        file was found.

        :param str directory: stored in ``self.directory``.
        :param str prefix: stored in ``self.prefix``.
        """
        self.prefix = prefix
        self.directory = directory

        # Low quality isoforms
        lq_path = self.lq_isoforms = self.get_file("lq_isoforms.fastq")
        if lq_path:
            self.lq_sequence = FastQ(lq_path)

        # High quality isoforms
        hq_path = self.hq_isoforms = self.get_file("hq_isoforms.fastq")
        if hq_path:
            self.hq_sequence = FastQ(hq_path)

        # General info
        csv_path = self.csv = self.get_file("-file.csv")
        if csv_path:
            self.data = pd.read_csv(csv_path)

        # CCS fasta sequence (looked up with noprefix=True)
        ccs_path = self.ccs = self.get_file("ccs.fasta", noprefix=True)
        if ccs_path:
            # The path held by self.ccs is replaced by the FastA object.
            self.ccs = FastA(ccs_path)
Exemplo n.º 12
0
def get_fasta_stats(filename, sample=1e16):
    """Return the result of ``FastA.get_stats`` for the file *filename*.

    :param filename: FASTA file handed to ``sequana.FastA``.
    :param sample: accepted but not used by this function.
    :return: whatever ``FastA(filename).get_stats()`` returns.
    """
    from sequana import FastA

    return FastA(filename).get_stats()
Exemplo n.º 13
0
 def __init__(self, filename, shift=4):
     """Build a ``FastA`` object from *filename* and store it in ``self.SIRV``.

     :param filename: file handed to ``sequana.FastA``.
     :param shift: NOTE(review): currently ignored. ``self.shift`` is always
         set to 5, whatever the value given here (the default being 4).
         Confirm which value is intended before using the argument.
     """
     from sequana import FastA
     self.SIRV = FastA(filename)
     # Hard-coded: does not depend on the ``shift`` argument.
     self.shift = 5
Exemplo n.º 14
0
 def __init__(self, filename):
     """Keep *filename* and build the matching ``FastA`` object.

     :param filename: stored in ``self.filename`` and handed to ``FastA``;
         the resulting object is stored in ``self.fasta``.
     """
     self.filename = filename
     self.fasta = FastA(filename)
Exemplo n.º 15
0
def test_others():
    """Check sizes, statistics and the export methods of ``FastA``."""
    path = sequana_data("test_fasta.fasta")
    fasta = FastA(path)

    # The file holds 16 records, whichever way they are counted.
    assert len(fasta) == 16
    for attribute in ("comments", "names", "sequences"):
        assert len(getattr(fasta, attribute)) == 16
    assert is_fasta(path) == True
    fasta.get_lengths_as_dict()

    # Random selection given as a number, a list or a set.
    with TempFile(suffix='.fasta') as tmp:
        for selection in (4, [1, 2, 3], {1, 2, 3}):
            fasta.select_random_reads(selection, output_filename=tmp.name)

    assert fasta.get_stats()['N'] == 16
    assert fasta.get_stats()['mean_length'] > 454

    # Exports, all written to the same temporary file.
    with TempFile(suffix='.fasta') as tmp:
        for export in (fasta.reverse_and_save, fasta.to_fasta,
                       fasta.to_igv_chrom_size):
            export(tmp.name)
Exemplo n.º 16
0
def test_fasta_filtering():
    """Check ``FastA.filter`` with names to exclude and names to keep."""
    path = sequana_data("test_fasta_filtering.fa")

    fasta = FastA(path)
    with TempFile(suffix='.fasta') as tmp:
        fasta.to_fasta(tmp.name)
        fasta.save_ctg_to_fasta("A", tmp.name)

    # Excluding A and B must leave C and D only.
    with TempFile(suffix='.fasta') as tmp:
        fasta.filter(tmp.name, names_to_exclude=["A", "B"])
        remaining = FastA(tmp.name).names
        assert set(remaining) == {"C", "D"}

    # Keeping A must leave A only (the input file is read again first).
    fasta = FastA(path)
    with TempFile(suffix='.fasta') as tmp:
        fasta.filter(tmp.name, names_to_keep=["A"])
        remaining = FastA(tmp.name).names
        assert set(remaining) == {"A"}