Пример #1
0
 def fasta(self):
     """The fasta file containing the filtered genes of this cluster
     The names now will correspond to long descriptive names"""
     fasta = FASTA(self.p.fasta)
     if not fasta:
         fasta.create()
         for gene in self.filtered_genes: fasta.add_str(str(gene), name=gene.name)
         fasta.close()
     return fasta
Пример #2
0
 def fasta(self):
     """The fasta file containing the filtered genes of this cluster
     The names now will correspond to long descriptive names"""
     fasta = FASTA(self.p.fasta)
     if not fasta:
         fasta.create()
         for gene in self.filtered_genes:
             fasta.add_str(str(gene), name=gene.name)
         fasta.close()
     return fasta
Пример #3
0
 def test(self):
     """Search one sequence, and see if it works."""
     # New directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (H**o sapiens mRNA for prepro cortistatin) #
     seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
     CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
     CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
     AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
     CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
     TTTAATTACAGACCTGAA"""
     seq = seq.replace('\n','')
     seq = seq.replace(' ','')
     # Make input #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.create()
     input_fasta.add_str(seq, "My test sequence")
     input_fasta.close()
     # Make output #
     out_path = directory + 'output.blast'
     # Make extras parameters #
     params = {'-outfmt': 0,
               '-evalue': 1e-5,
               '-perc_identity': 99}
     # Make the search #
     search = SeqSearch(input_fasta,
                        self.blast_db,
                        'nucl',
                        'blast',
                        num_threads = 1,
                        out_path    = out_path,
                        params      = params)
     # Run it #
     search.run()
     # Print result #
     print "Success", directory
Пример #4
0
 def test(self):
     """Search one sequence, and see if it works."""
     # New directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (H**o sapiens mRNA for prepro cortistatin) #
     seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
     CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
     CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
     AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
     CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
     TTTAATTACAGACCTGAA"""
     seq = seq.replace('\n','')
     seq = seq.replace(' ','')
     # Make input #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.create()
     input_fasta.add_str(seq, "My test sequence")
     input_fasta.close()
     # Make output #
     out_path = directory + 'output.blast'
     # Make extras parameters #
     params = {'-outfmt': 0,
               '-evalue': 1e-5,
               '-perc_identity': 99}
     # Make the search #
     search = SeqSearch(input_fasta,
                        self.blast_db,
                        'nucl',
                        'blast',
                        num_threads = 1,
                        out_path    = out_path,
                        params      = params)
     # Run it #
     search.run()
     # Print result #
     print("Success", directory)
Пример #5
0
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    for seq in genome:
        out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    out_fasta.gzip_to()
    out_fasta.remove()


def lines():
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            yield name + '\t' + gene.description[len(name):].rstrip(
                ' |') + '\n'


annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle:
Пример #6
0
    seq = seq.split('[')[0]
    return seq

for faa,fna in zip(faas, fnas):
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    out_fasta.gzip_to()
    out_fasta.remove()

def lines():
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            yield name + '\t' + gene.description[len(name):].rstrip(' |') + '\n'

annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle: handle.writelines(lines())
Пример #7
0
class Foraminifera(Database):
    """This is a custom database containing exlcusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:
    
            from seqsearch.databases.foraminifera import foraminifera
            foraminifera.process()
            print foraminifera.tax_depth_freq

    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Loop #
        for seq in raw:
            # Parse #
            name = seq.id[11:].split('|')
            num  = name.pop(0)
            # Check #
            for x in name: assert ';' not in x
            for x in name: assert '\t' not in x
            # Make ranks #
            ranks = ['Eukaryota'                       , # 0 Domain
                     'Rhizaria'                        , # 1 Kingdom
                     'Foraminifera'                    , # 2 Phylum
                     name[0]                           , # 3 Class
                     name[1]                           , # 4 Order
                     name[2]                           , # 5 Family
                     name[3]                           , # 6 Tribe
                     name[4]                           , # 7 Genus
                     name[5]]                            # 8 Species
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(seq.seq), name="foram" + num)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        # Close files #
        self.alignment.close()
        self.taxonomy.close()