Exemplo n.º 1
0
 def combine_rerun_with_orig(self):
     """Merge the reads of a rerun sample into the sample it reruns.

     Special case for a sample with low reads that was sequenced again in
     another pool. Call this on the rerun sample (not on the original), just
     before the combine_reads() method of the associated cluster.
     Returns False when this sample carries no rerun information and True
     once the merge has been done."""
     # Samples without a 'rerun' entry have nothing to merge #
     if self.info.get('rerun') is None: return False
     # This sample must already contain reads #
     assert self.fasta.count > 0
     # Locate the original sample (stored pool and num are one above the list index) #
     rerun          = self.info['rerun']
     run, pool, num = (rerun[key] for key in ('run', 'pool', 'num'))
     original       = illumitag.runs[run][pool-1][num-1]
     combined       = FASTA(original.base_dir + 'rerun_merged.fasta')
     # A count mismatch means the original was already merged once #
     assert original.count == original.fasta.count
     # Put both read sets into the temporary file #
     combined.create()
     combined.add(original.fasta)
     combined.add(self.fasta)
     combined.close()
     # Renumber into the original's fasta (presumably overwriting it -- confirm), then drop the temporary file #
     combined.rename_with_num(original.name + '_read', original.fasta)
     combined.remove()
     # Reload the fasta object and verify that it grew #
     original.fasta = FASTA(original.fasta.path)
     assert original.count < original.fasta.count
     return True
Exemplo n.º 2
0
 def fasta(self):
     """Return the FASTA file at `self.p.fasta` holding the filtered genes
     of this cluster. When the FASTA object evaluates as false it is written
     first, one record per filtered gene, named with `gene.name`."""
     genes_fasta = FASTA(self.p.fasta)
     # Already usable: hand it back untouched #
     if genes_fasta: return genes_fasta
     # Otherwise write every filtered gene #
     genes_fasta.create()
     for gene in self.filtered_genes:
         genes_fasta.add_str(str(gene), name=gene.name)
     genes_fasta.close()
     return genes_fasta
Exemplo n.º 3
0
 def fasta(self):
     """Return the FASTA file at `self.p.fasta` with the filtered genes of
     this cluster, each record named with `gene.name`. The file is only
     written when the FASTA object evaluates as false."""
     result = FASTA(self.p.fasta)
     if result: return result
     # Build the file from the filtered genes #
     result.create()
     for gene in self.filtered_genes: result.add_str(str(gene), name=gene.name)
     result.close()
     return result
Exemplo n.º 4
0
 def fasta(self):
     """Return the FASTA file at `self.p.proteins`. If it does not exist yet,
     it is created from every sequence of `pfam.fasta` whose description
     contains `self.fam_name`."""
     proteins = FASTA(self.p.proteins)
     # Nothing to build when the file is already there #
     if proteins.exists: return proteins
     # Keep only the sequences mentioning this family #
     proteins.create()
     for seq in pfam.fasta:
         if self.fam_name not in seq.description: continue
         proteins.add_seq(seq)
     proteins.close()
     # The result must evaluate as true #
     assert proteins
     return proteins
Exemplo n.º 5
0
 def fasta(self):
     """Return the FASTA file at `self.p.proteins`, creating it on first use
     from the `pfam.fasta` sequences whose description contains
     `self.fam_name`."""
     family_fasta = FASTA(self.p.proteins)
     if family_fasta.exists: return family_fasta
     # First use: filter the pfam sequences into the new file #
     family_fasta.create()
     for record in pfam.fasta:
         if self.fam_name in record.description:
             family_fasta.add_seq(record)
     family_fasta.close()
     # Fail loudly if the new file evaluates as false #
     assert family_fasta
     return family_fasta
Exemplo n.º 6
0
 def test(self):
     """Search one sequence, and see if it works.

     Writes a single nucleotide sequence to 'input.fasta' in a new temporary
     directory, runs a SeqSearch ('nucl', 'blast') against `self.blast_db`
     with the output going to 'output.blast' in the same directory, then
     prints that directory."""
     # New directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
     seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
     CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
     CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
     AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
     CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
     TTTAATTACAGACCTGAA"""
     # Remove the line breaks and the indentation of the literal above #
     seq = seq.replace('\n','')
     seq = seq.replace(' ','')
     # Make input #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.create()
     input_fasta.add_str(seq, "My test sequence")
     input_fasta.close()
     # Make output #
     out_path = directory + 'output.blast'
     # Make extras parameters #
     params = {'-outfmt': 0,
               '-evalue': 1e-5,
               '-perc_identity': 99}
     # Make the search #
     search = SeqSearch(input_fasta,
                        self.blast_db,
                        'nucl',
                        'blast',
                        num_threads = 1,
                        out_path    = out_path,
                        params      = params)
     # Run it #
     search.run()
     # Print result: one pre-formatted argument gives the same text on Python 2 and 3 #
     print("Success %s" % (directory,))
Exemplo n.º 7
0
 def test(self):
     """Smoke test: search a single sequence against `self.blast_db`.

     The query is written to 'input.fasta' in a new temporary directory and
     the search output is directed to 'output.blast' next to it. The
     directory is printed once the search has run."""
     # Work inside a fresh temporary directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
     seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
     CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
     CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
     AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
     CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
     TTTAATTACAGACCTGAA"""
     # Strip the newlines and indentation that came with the literal #
     seq = seq.replace('\n','').replace(' ','')
     # Write the query file #
     query = FASTA(directory + 'input.fasta')
     query.create()
     query.add_str(seq, "My test sequence")
     query.close()
     # Extra command line parameters #
     extra = {'-outfmt': 0, '-evalue': 1e-5, '-perc_identity': 99}
     # Build and run the search #
     search = SeqSearch(query, self.blast_db, 'nucl', 'blast',
                        num_threads = 1,
                        out_path    = directory + 'output.blast',
                        params      = extra)
     search.run()
     # Report where the files are #
     print("Success", directory)
Exemplo n.º 8
0
    fnas_genes = [strip(seq) for seq in fna]
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

# Every gene name of every fna file, duplicates included #
fnas_genes = [strip(seq) for fna in fnas for seq in fna]
# Total versus distinct names: the two numbers differ when names repeat.
# One pre-formatted argument prints the same text on Python 2 and Python 3.
print("%s %s" % (len(fnas_genes), len(set(fnas_genes))))

# Write one FASTA file per genome with records renamed through strip(),
# then call gzip_to() on it and remove the uncompressed file #
for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    for seq in genome:
        out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    out_fasta.gzip_to()
    out_fasta.remove()


def lines():
    """Yield one annotation line per gene of every genome in `faas`:
    the name given by strip(), a tab, then what follows the name in the
    gene description with trailing spaces and pipes removed."""
    for genome in faas:
        for gene in genome:
            short_name = strip(gene)
            # The description starts with the name: keep only what follows #
            remainder = gene.description[len(short_name):]
            yield short_name + '\t' + remainder.rstrip(' |') + '\n'


# Write every line produced by lines() to the annotations file #
annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle:
    handle.writelines(lines())
Exemplo n.º 9
0
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM.

    Wraps two FASTA objects, `fwd` and `rev`, and forwards each operation
    to both of them."""
    format = 'fasta'

    def __len__(self):
        return self.count

    def __iter__(self):
        return self.parse()

    def __nonzero__(self):
        return bool(self.fwd) and bool(self.rev)

    # Python 3 looks up __bool__ rather than __nonzero__; without this alias
    # truth testing would silently fall back to __len__ there.
    __bool__ = __nonzero__

    def __repr__(self):
        return '<%s object on "%s" and "%s">' % \
            (self.__class__.__name__, self.fwd.path, self.rev.path)

    def __enter__(self):
        # create() returns self, so `with pair as p:` binds the pair itself
        return self.create()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @property
    def exists(self):
        """True only when both files report that they exist."""
        return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        """`fwd` and `rev` are each passed to FASTA(); `parent` is stored as is."""
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra #
        self.gzipped = self.fwd.gzipped
        self.parent = parent

    @property_cached
    def count(self):
        """The count of the forward file, after checking the reverse file agrees."""
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        self.fwd.open()
        self.rev.open()

    def parse(self):
        """Iterate over (forward, reverse) tuples taken from both files in step."""
        return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        self.fwd.close()
        self.rev.close()

    def create(self):
        """Call create() on both files and return this pair."""
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r):
        """Add one forward and one reverse sequence."""
        return self.add_pair((f, r))

    def add_pair(self, pair):
        """Add `pair[0]` to the forward file and `pair[1]` to the reverse file."""
        self.fwd.add_seq(pair[0])
        self.rev.add_seq(pair[1])

    def remove(self):
        self.fwd.remove()
        self.rev.remove()

    @property
    def progress(self):
        """Just like self.parse but display a progress bar"""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Write the `down_to` pairs picked by isubsample() into `dest_pair`.

        `down_to` must be smaller than the number of pairs. When `dest_pair`
        is None a new pair of the same class is made from paths carrying a
        "subsampled" insert."""
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # NOTE(review): `fwd_path` and `rev_path` are not assigned anywhere
            # in this class, so this default branch raises AttributeError unless
            # something else provides them -- confirm the intended attribute.
            dest_fwd_path = self.fwd_path.new_name_insert("subsampled")
            dest_rev_path = self.rev_path.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        dest_pair.create()
        for pair in isubsample(self, down_to):
            dest_pair.add_pair(pair)
        # Close the files that were just written (the original closed the
        # non-existent `self.subsampled`, leaving the destination open) #
        dest_pair.close()
        # Did it work #
        assert len(dest_pair) == down_to

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Pair up the parse_primers() results of both files; the reported
        length is the one of the forward result."""
        fwd_gen = self.fwd.parse_primers(*args, **kwargs)
        rev_gen = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
Exemplo n.º 10
0
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM.

    Holds a forward (`fwd`) and a reverse (`rev`) FASTA object and applies
    each operation to both."""
    format = 'fasta'

    def __len__(self): return self.count
    def __iter__(self): return self.parse()
    def __nonzero__(self): return bool(self.fwd) and bool(self.rev)
    # Python 3 uses __bool__; without the alias it would fall back to __len__ #
    __bool__ = __nonzero__
    def __repr__(self): return '<%s object on "%s" and "%s">' % \
                        (self.__class__.__name__, self.fwd.path, self.rev.path)

    # create() returns self, so the `with` statement binds this pair #
    def __enter__(self): return self.create()
    def __exit__(self, exc_type, exc_value, traceback): self.close()

    @property
    def exists(self): return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        """`fwd` and `rev` are each passed to FASTA(); `parent` is stored as is."""
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra #
        self.gzipped = self.fwd.gzipped
        self.parent = parent

    @property_cached
    def count(self):
        """The forward count, after asserting that the reverse count matches."""
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        self.fwd.open()
        self.rev.open()

    def parse(self):
        """Iterate over (forward, reverse) tuples read from both files in step."""
        return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        self.fwd.close()
        self.rev.close()

    def create(self):
        """Call create() on both files and return this pair."""
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r):
        """Add one forward and one reverse sequence."""
        return self.add_pair((f,r))

    def add_pair(self, pair):
        """Add `pair[0]` to the forward file and `pair[1]` to the reverse file."""
        self.fwd.add_seq(pair[0])
        self.rev.add_seq(pair[1])

    def remove(self):
        self.fwd.remove()
        self.rev.remove()

    @property
    def progress(self):
        """Just like self.parse but display a progress bar"""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Write the `down_to` pairs chosen by isubsample() into `dest_pair`.

        `down_to` must be smaller than the number of pairs. Without a
        `dest_pair`, a new pair of the same class is built from paths that
        carry a "subsampled" insert."""
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # NOTE(review): nothing in this class sets `fwd_path` / `rev_path`,
            # so this branch raises AttributeError unless they are provided
            # elsewhere -- confirm which attribute was meant.
            dest_fwd_path = self.fwd_path.new_name_insert("subsampled")
            dest_rev_path = self.rev_path.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        dest_pair.create()
        for pair in isubsample(self, down_to): dest_pair.add_pair(pair)
        # Close what was written; `self.subsampled` never existed #
        dest_pair.close()
        # Did it work #
        assert len(dest_pair) == down_to

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Zip the parse_primers() results of both files; the length reported
        is the one of the forward result."""
        fwd_gen = self.fwd.parse_primers(*args, **kwargs)
        rev_gen = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
Exemplo n.º 11
0
    seq = seq.split('[')[0]
    return seq

# Compare the gene names of each faa file with those of its fna counterpart #
for faa,fna in zip(faas, fnas):
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    # Number of names found in only one of the two files.
    # One pre-formatted argument prints the same text on Python 2 and Python 3.
    print("%s %s discrepancies" % (faa, len(set(fnas_genes) ^ set(faas_genes))))
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

# Total versus distinct names over all fna files #
fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print("%s %s" % (len(fnas_genes), len(set(fnas_genes))))

# Write one FASTA file per genome with records renamed through strip(),
# then call gzip_to() on it and remove the uncompressed file #
for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    out_fasta.gzip_to()
    out_fasta.remove()

def lines():
    """Yield, for each gene of each genome in `faas`, a line made of the
    strip() name, a tab and the rest of the gene description with trailing
    spaces and pipes removed."""
    for genome in faas:
        for gene in genome:
            label = strip(gene)
            # Skip the leading name inside the description #
            tail = gene.description[len(label):]
            yield label + '\t' + tail.rstrip(' |') + '\n'

# Write every line produced by lines() to the annotations file #
annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle:
    handle.writelines(lines())
Exemplo n.º 12
0
class Foraminifera(Database):
    """A custom database containing exclusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    Place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:

            from seqsearch.databases.foraminifera import foraminifera
            foraminifera.process()
            print foraminifera.tax_depth_freq

    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The nine rank names, in the order used by process()."""
        return ['Domain', 'Kingdom', 'Phylum', 'Class', 'Order',
                'Family', 'Tribe', 'Genus', 'Species']

    def __init__(self, base_dir=None):
        # Fall back on the home directory when no base is given #
        root = home if base_dir is None else base_dir
        self.base_dir = root + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # The two files written by process() #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        """Convert the raw file into the alignment FASTA and the taxonomy file.

        Each record id is cut after its first 11 characters and split on '|':
        the first field is the number, the next six become ranks 3 to 8.
        Records are renamed "foram" + number in both outputs."""
        # The file that was received by email without documentation #
        raw = FASTA(self.p.cor)
        # Open both outputs #
        self.alignment.create()
        self.taxonomy.create()
        # Ranks 0 to 2 (Domain, Kingdom, Phylum) are the same for every record #
        shared_ranks = ['Eukaryota', 'Rhizaria', 'Foraminifera']
        for seq in raw:
            # Split the identifier into its number and its taxonomy fields #
            fields = seq.id[11:].split('|')
            num    = fields.pop(0)
            # The separators of the tax file must not appear inside a field #
            for field in fields:
                assert ';' not in field
                assert '\t' not in field
            # Ranks 3 to 8: Class, Order, Family, Tribe, Genus, Species #
            ranks = shared_ranks + [fields[0], fields[1], fields[2],
                                    fields[3], fields[4], fields[5]]
            label = "foram" + num
            # One record in the fasta, one line in the tax file #
            self.alignment.add_str(str(seq.seq), name=label)
            self.taxonomy.add_str(label + '\t' + ';'.join(ranks) + '\n')
        # Close both outputs #
        self.alignment.close()
        self.taxonomy.close()
Exemplo n.º 13
0
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    def __repr__(self):
        template = '<%s object "%s" with %i samples>'
        return template % (self.__class__.__name__, self.name, len(self.samples))

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        """Look a child up by short name (string key), by `num` (integer key,
        when the first child has a `num`), or else by plain indexing."""
        if isinstance(key, basestring):
            return [child for child in self.children if child.short_name == key.lower()][0]
        if isinstance(key, int) and hasattr(self.first, 'num'):
            return [child for child in self.children if child.num == key][0]
        return self.children[key]

    @property
    def first(self):
        return self.children[0]

    @property
    def count_seq(self):
        """The sum of len() over all samples."""
        return sum(len(sample) for sample in self)

    def __init__(self, samples, name, base_dir=None):
        # Keep the samples under both attribute names #
        self.name     = name
        self.samples  = samples
        self.children = samples
        # Short names of the used samples must not repeat #
        used_names = [s.short_name for s in samples if s.used]
        assert len(used_names) == len(set(used_names))
        # Distinct pools, ordered by their id_name #
        self.pools = sorted(set(s.pool for s in self.samples), key=lambda pool: pool.id_name)
        # Default directory lives under the illumitag view directory #
        self.base_dir = base_dir if base_dir else illumitag.view_dir + "clusters/" + self.name + '/'
        # Nothing heavy is loaded yet #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Pools first, then samples #
        for pool in self.pools:
            pool.load()
        for sample in self.samples:
            sample.load()
        # Paths #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The cluster is a project when it holds exactly that project's samples #
        if set(self.samples) == set(self.first.pool.project.samples):
            self.project = self.first.pool.project
        else:
            self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # All reads in one file #
        self.reads = FASTA(self.p.all_reads_fasta)
        # The three OTU picking methods, uparse being the preferred one #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit  = CdhitOTUs(self)
        self.otus = self.otu_uparse
        # Simple reporting and full report #
        self.reporter = ClusterReporter(self)
        self.report = ClusterReport(self)
        # Done #
        self.loaded = True
        return self

    def run(self, *args, **kwargs):
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        for sample in tqdm(self):
            sample.process()

    def combine_reads(self):
        """Concatenate the fasta file of every sample into `self.reads` with
        a shell `cat`, and return `self.reads`."""
        sources = ' '.join([sample.fasta.path for sample in self])
        shell_output('cat %s > %s' % (sources, self.reads))
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end.
        Reads shorter than `length` are dropped."""
        self.size_trimmed = FASTA(new_temp_path())
        def trim_iterator(reads):
            for read in reads:
                if len(read) >= length:
                    yield read[-length:]
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Swap the trimmed file in place of the reads #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A DataFrame with one row of `info` per sample, indexed by short name."""
        rows   = [s.info for s in self]
        labels = [s.short_name for s in self]
        return pandas.DataFrame(rows, index=labels)

    def export_metadata(self):
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
Exemplo n.º 14
0
# Get clustering values #
# The project must span exactly two distinct runs for this unpacking to work #
r1, r2 = set(p.run for p in proj)
r1.parse_report_xml()
r2.parse_report_xml()
# Ratio of the 'DensityPF' to the 'DensityRaw' statistic for each run.
# A single parenthesised expression prints identically on Python 2 and 3.
print(float(r1.report_stats['fwd']['DensityPF']) / float(r1.report_stats['fwd']['DensityRaw']))
print(float(r2.report_stats['fwd']['DensityPF']) / float(r2.report_stats['fwd']['DensityRaw']))

# Check below 400 bp sequences #
# Directory and FASTA file that will receive the reads kept by over_iterator() #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    """Yield, in order, only the reads whose length is at most `max_length`."""
    for candidate in reads:
        # Anything longer than the limit is left out #
        if len(candidate) > max_length: continue
        yield candidate
# Collect into `over` the quality-filtered reads of every pool that pass over_iterator() #
over.create()
for pool in pools: over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
# Plot the last entry of the file's `graphs` collection #
over.graphs[-1].plot()
# Run the SimpleCrestTaxonomy assignment on these reads and plot its composition graph #
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
# Same with SimpleRdpTaxonomy, working in the same folder #
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()

# Check unassembled mate pairs #
# The `unassembled` object of every pool and the path of its flipped reads #
unassembled = [p.good_barcodes.unassembled for p in pools]
paths = [u.flipped_reads.path for u in unassembled]
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "unassembled_taxonomy/")
all_unassembled = FASTA(folder + 'unassembled_reads.fasta')
# Concatenate all those files into one with a shell `cat` #
shell_output('cat %s > %s' % (' '.join(paths), all_unassembled))
# Taxonomy object over the concatenated reads #
tax = SimpleRdpTaxonomy(all_unassembled, folder)