Exemplo n.º 1
0
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
Exemplo n.º 2
0
class Foraminifera(Database):
    """This is a custom database containing exlcusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:
    
            from seqsearch.databases.foraminifera import foraminifera
            foraminifera.process()
            print foraminifera.tax_depth_freq

    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Loop #
        for seq in raw:
            # Parse #
            name = seq.id[11:].split('|')
            num  = name.pop(0)
            # Check #
            for x in name: assert ';' not in x
            for x in name: assert '\t' not in x
            # Make ranks #
            ranks = ['Eukaryota'                       , # 0 Domain
                     'Rhizaria'                        , # 1 Kingdom
                     'Foraminifera'                    , # 2 Phylum
                     name[0]                           , # 3 Class
                     name[1]                           , # 4 Order
                     name[2]                           , # 5 Family
                     name[3]                           , # 6 Tribe
                     name[4]                           , # 7 Genus
                     name[5]]                            # 8 Species
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(seq.seq), name="foram" + num)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        # Close files #
        self.alignment.close()
        self.taxonomy.close()