Example #1
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case (note: this branch is deliberately disabled by the `and False`) #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        if 'mate' not in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i,value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        print "Loss: %.2f%%" % (100 * (1 - (len(self.raw_fastq)/len(self.reads))))

    def process(self):
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print "make_fastq for sample %s completed" % self.id_name
Example #2
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
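
For orientation, a typical call order for this class might look like the following sketch; pool and path are hypothetical stand-ins for a real parent pool object and an input FASTQ path:

quality_reads = QualityReads(path, pool)
quality_reads.filter_unused()       # keep only reads from samples marked as used
quality_reads.trim_primers()        # slice off the fwd/rev primer lengths
quality_reads.make_mothur_output()  # linked FASTA plus a groups file for mothur
quality_reads.make_qiime_output()   # relabeled FASTA for QIIME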
Example #3
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    def __repr__(self): return '<%s object "%s" with %i samples>' % (self.__class__.__name__, self.name, len(self.samples))
    def __iter__(self): return iter(self.samples)
    def __len__(self): return len(self.samples)
    def __getitem__(self, key):
        if isinstance(key, basestring): return [c for c in self.children if c.short_name == key.lower()][0]
        elif isinstance(key, int) and hasattr(self.first, 'num'): return [c for c in self.children if c.num == key][0]
        else: return self.children[key]

    @property
    def first(self): return self.children[0]

    @property
    def count_seq(self):
        return sum([len(sample) for sample in self])

    def __init__(self, samples, name, base_dir=None):
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique #
        names = [s.short_name for s in samples if s.used]
        assert len(names) == len(set(names)), "Sample short names must be unique"
        # Figure out pools #
        self.pools = list(set([s.pool for s in self.samples]))
        self.pools.sort(key=lambda x: x.id_name)
        # Directory #
        if base_dir: self.base_dir = base_dir
        else: self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Load the pools and samples #
        for p in self.pools: p.load()
        for s in self.samples: s.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project #
        if set(self.samples) == set(self.first.pool.project.samples): self.project = self.first.pool.project
        else: self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit  = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting #
        self.reporter = ClusterReporter(self)
        # Full report #
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        for sample in tqdm(self): sample.process()

    def combine_reads(self):
        """This is the first function should call. It will combine all the
        reads of all the samples of this cluster into one big FASTA file."""
        paths = [sample.fasta.path for sample in self]
        shell_output('cat %s > %s' % (' '.join(paths), self.reads))
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end."""
        self.size_trimmed = FASTA(new_temp_path())
        def trim_iterator(reads):
            for read in reads:
                if len(read) < length: continue
                yield read[-length:]
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace it #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self): self.otu_uparse.run()

    @property
    def metadata(self):
        return pandas.DataFrame([s.info for s in self], index=[s.short_name for s in self])

    def export_metadata(self):
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
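
A typical end-to-end use of this class might look like the following sketch; samples is a hypothetical stand-in for a list of sample objects:

cluster = Cluster(samples, 'my_cluster')
cluster.load()             # build paths, the runner and the OTU pickers
cluster.process_samples()  # clean and rename every sample's reads
cluster.combine_reads()    # concatenate them into one big FASTA file
cluster.run_uparse()       # pick OTUs with the preferred method
cluster.export_metadata()  # write the per-sample metadata table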
Example #4
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome)"""

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    qual_threshold = 5
    qual_windowsize = 10
    min_length = 400

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.orig_reads)

    def create(self): self.orig_reads.create()
    def add_seq(self, read): self.orig_reads.add_seq(read)
    def close(self): self.orig_reads.close()

    def __init__(self, parent):
        # Save parent #
        self.parent, self.assemble_group = parent, parent
        self.samples = parent.samples
        self.pool    = self.parent.pool
        self.primers = self.pool.primers
        # Auto paths (note: `short_name` is expected to be defined by subclasses) #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Read files #
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Quality filtered (note: the parent is expected to compare equal to the string 'assembled') #
        if self.parent == 'assembled':
            self.qual_filtered = BarcodedFASTQ(self.p.qual_filtered,      samples=self.samples, primers=self.primers)
            self.len_filtered =  BarcodedFASTQ(self.p.len_filtered_fastq, samples=self.samples, primers=self.primers)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #
        self.load()

    def load(self): pass

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n"""
        def no_n_iterator(reads):
            fwd_len = self.pool.primers.fwd_len
            rev_len = self.pool.primers.rev_len
            for read in reads:
                if 'N' in read[fwd_len:-rev_len]: continue
                yield read
        self.n_filtered.write(no_n_iterator(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter"""
        def good_qual_iterator(reads):
            for read in reads:
                averaged = moving_average(read.letter_annotations["phred_quality"], self.qual_windowsize)
                if any([value < self.qual_threshold for value in averaged]): continue
                yield read
        self.qual_filtered.write(good_qual_iterator(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter"""
        def good_len_iterator(reads):
            for read in reads:
                if len(read) < self.min_length: continue
                yield read
        self.len_filtered.write(good_len_iterator(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        if self.pool.bar_len == 0:
            self.len_filtered.to_fasta(self.trimmed_barcodes)
        else:
            self.trimmed_barcodes.write(no_barcodes_iterator(self.len_filtered))
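
To make the windowed filter in qual_filter concrete: with qual_windowsize = 10 and qual_threshold = 5, a read is discarded as soon as any window of ten consecutive PHRED scores averages below five. A self-contained illustration, reusing the sliding-window moving_average sketched earlier:

# Hypothetical scores with one low-quality stretch in the middle.
scores = [30] * 12 + [2] * 10 + [30] * 12
averaged = moving_average(scores, 10)
discard = any(value < 5 for value in averaged)
print discard  # True: the all-2 window averages 2.0, well below the threshold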
Example #5
class UclustOTUs(OTUs):
    """Will use uclust via the qimme wraper to create OTU clusters from a given FASTA file
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table"""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = re.findall("run([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples+illumitag.pyrosamples if s.run_num==run_num and s.num==sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
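
The cluster_counts_table property recovers the originating sample from each read label by regular-expression matching. A standalone sketch of just that parsing step, on a made-up Illumina-style label:

import re

label = "run3_pool2_sample14_read57"  # hypothetical read id
nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", label)
run_num, pool_num, sample_num, read_num = map(int, nums[0])
# -> 3, 2, 14, 57; pyrosample labels lack the pool component and are
#    caught by the second, shorter pattern in the property above.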