def splitFastaFile(infile, informat, outdir):
    """Write every record of a sequence file to its own FASTA file.

    Each record parsed from ``infile`` is written to
    ``<outdir>/<record.id>.fasta``.  ``outdir`` is created (one level, via
    ``os.mkdir``) the first time a record is about to be written.

    @param infile: path of the sequence file to split
    @param informat: Bio.SeqIO format name used to parse ``infile``
    @param outdir: directory receiving one ``.fasta`` file per record
    """
    # Context managers guarantee the input and every output handle are
    # closed (and therefore flushed) even if parsing or writing fails.
    with open(infile) as in_handle:
        for record in SeqIO.parse(in_handle, informat):
            if not os.path.exists(outdir):
                os.mkdir(outdir)
            out_path = os.path.join(outdir, record.id + '.fasta')
            with open(out_path, 'w') as out_handle:
                SeqIO.write([record], out_handle, "fasta")
Пример #2
0
 def test_generated(self):
     """Write and read back odd SeqRecord objects.

     Builds eight unusual records (very long descriptions, mutable and
     unknown sequences, empty sequences, extreme Solexa qualities), writes
     them in each listed format and checks via ``compare_records`` that
     parsing the output gives the records back.
     """
     record1 = SeqRecord(Seq("ACGT"*500, generic_dna),  id="Test", description="Long "*500,
                        letter_annotations={"phred_quality":[40,30,20,10]*500})
     record2 = SeqRecord(MutableSeq("NGGC"*1000),  id="Mut", description="very "*1000+"long",
                        letter_annotations={"phred_quality":[0,5,5,10]*1000})
     record3 = SeqRecord(UnknownSeq(2000,character="N"),  id="Unk", description="l"+("o"*1000)+"ng",
                        letter_annotations={"phred_quality":[0,1]*1000})
     record4 = SeqRecord(Seq("ACGT"*500),  id="no_descr", description="", name="",
                        letter_annotations={"phred_quality":[40,50,60,62]*500})
     record5 = SeqRecord(Seq("",generic_dna),  id="empty_p", description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality":[]})
     record6 = SeqRecord(Seq(""),  id="empty_s", description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality":[]})
     record7 = SeqRecord(Seq("ACNN"*500),  id="Test_Sol", description="Long "*500,
                        letter_annotations={"solexa_quality":[40,30,0,-5]*500})
     record8 = SeqRecord(Seq("ACGT"),  id="HighQual", description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality":[0,10,100,1000]})
     #TODO - Record with no identifier?
     records = [record1, record2, record3, record4, record5, record6, record7, record8]
     #TODO - Have a Biopython defined "DataLossWarning?"
     # catch_warnings() restores the filter list on exit, even when an
     # assertion fails.  The previous simplefilter()/filters.pop() pairing
     # was wrong: simplefilter() inserts at the FRONT of warnings.filters
     # while pop() removes the LAST entry.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         #TODO - Include phd output?
         for out_format in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
             handle = StringIO()
             SeqIO.write(records, handle, out_format)
             handle.seek(0)
             compare_records(records,
                             list(SeqIO.parse(handle, out_format)),
                             truncation_expected(out_format))
Пример #3
0
 def loop(self, filename, format):
     """Round-trip the records of ``filename`` through BioSQL and GenBank output.

     The records are parsed from ``filename`` (in ``format``), loaded into a
     new BioSQL namespace, looked up again by name, written to an in-memory
     GenBank handle and re-parsed; each stage is compared with the original
     records.
     """
     # Close the input file as soon as the records are in memory.
     with open(filename, "rU") as in_handle:
         original_records = list(SeqIO.parse(in_handle, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     try:
         db_name = "test_loop_%s" % filename  # new namespace!
         db = server.new_database(db_name)
         count = db.load(original_records)
         self.assertEqual(count, len(original_records))
         server.commit()
         #Now read them back...
         biosql_records = [db.lookup(name=rec.name)
                           for rec in original_records]
         #And check they agree
         self.assertTrue(compare_records(original_records, biosql_records))
         #Now write to a handle...
         handle = StringIO()
         SeqIO.write(biosql_records, handle, "gb")
         #Now read them back...
         handle.seek(0)
         new_records = list(SeqIO.parse(handle, "gb"))
         #And check they still agree
         self.assertEqual(len(new_records), len(original_records))
         for old, new in zip(original_records, new_records):
             #TODO - remove this hack because we don't yet write these (yet):
             for key in ["comment", "references", "db_source"]:
                 if key in old.annotations and key not in new.annotations:
                     del old.annotations[key]
             self.assertTrue(compare_record(old, new))
     finally:
         # Release the connection even when one of the assertions fails.
         server.close()
Пример #4
0
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:
        MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:
        Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:
        Protein sequences in FASTA format (.fasta)
    @return:
        Name of the file ("homologs.codon.aln") that receives the output of
        ``pal2nal.pl ... -output paml``.  The name is returned even when the
        tool fails; a warning is written to stderr in that case.
    """
    # Local import so this block does not rely on a module-level import.
    import subprocess

    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    # Proteins and nucleotides are paired by position in their files.
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")
    fname_codon = "homologs.codon.aln"
    # Argument list + explicit stdout redirection instead of a shell string:
    # file names containing spaces or shell metacharacters are passed safely.
    cmd = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2, "-output", "paml"]
    with open(fname_codon, "w") as codon_out:
        try:
            status = subprocess.call(cmd, stdout=codon_out)
        except OSError as err:
            # os.system() never raised for a missing executable; keep that
            # contract but make the failure visible.
            sys.stderr.write("WARNING: could not run %s: %s\n" % (cmd[0], err))
            status = None
    if status:
        sys.stderr.write("WARNING: %s exited with status %s\n" % (cmd[0], status))
    return fname_codon
Пример #5
0
 def test_fastq_2000(self):
     """Round-trip a single upper case 2000bp FASTQ read through parse/write."""
     title = "id descr goes here"
     bases = "ACGT"*500
     quals = "!@a~"*500
     data = "@%s\n%s\n+\n%s\n" % (title, bases, quals)
     out_handle = StringIO()
     parsed = SeqIO.parse(StringIO(data), "fastq")
     written = SeqIO.write(parsed, out_handle, "fastq")
     # Exactly one record must be written, byte-for-byte identical.
     self.assertEqual(1, written)
     self.assertEqual(data, out_handle.getvalue())
Пример #6
0
 def setUp(self):
     """Build one codon alignment per configured test alignment file.

     Each ``TEST_ALIGN_FILE*`` entry is indexed as ``entry[0]`` (file paths:
     nucleotide FASTA, protein CLUSTAL alignment and, for the 'id' mode, an
     id-correspondence file) and ``entry[1]`` (the mode: 'parse', 'index' or
     'id').  The resulting alignments are stored in ``self.alns``.

     Raises ValueError for an unknown mode instead of silently reusing the
     alignment built for the previous entry.
     """
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     alns = []
     for entry in self.aln_file:
         paths, mode = entry[0], entry[1]
         # Keyword arguments shared by every mode; mode-specific ones are added below.
         build_kwargs = {'alphabet': codonalign.default_codon_alphabet}
         if mode == 'parse':
             nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
         elif mode == 'index':
             nucl = SeqIO.index(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             build_kwargs['max_score'] = 20
         elif mode == 'id':
             nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             # Each line maps its first whitespace-separated field to its second.
             corr_dict = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     fields = line.split()
                     corr_dict[fields[0]] = fields[1]
             build_kwargs['corr_dict'] = corr_dict
         else:
             raise ValueError("Unknown codon alignment test mode: %r" % (mode,))
         prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
         with warnings.catch_warnings():
             warnings.simplefilter('ignore')
             caln = codonalign.build(prot, nucl, **build_kwargs)
         alns.append(caln)
         nucl.close()  # Close the indexed FASTA file
     self.alns = alns
Пример #7
0
def needle_score(seq1, seq2, verbose=False, keep=False):
    """
    Get the Needleman-Wunsch score for aligning two sequences.

    Both sequences are written to temporary FASTA files and the external
    ``needle`` program is run on them with gap open/extend penalties of 0.
    The score is read from the ``# Score: ...`` line of its output file.

    @param seq1: first record (anything SeqIO.write accepts)
    @param seq2: second record
    @param verbose: print the command line before running it
    @param keep: keep the two temporary input files instead of deleting them
    @return: the score as a float, or 0 when no score line is found
    @raise subprocess.CalledProcessError: if ``needle`` exits non-zero
    """
    ntf = tempfile.NamedTemporaryFile
    # Text mode ('w+') so SeqIO.write() and the str regex below also work on
    # Python 3, where the default 'w+b' handles only accept/return bytes.
    # os.devnull must be opened for writing to be usable as the child's stderr.
    with ntf(mode='w+', prefix='seq1', delete=not keep) as fh1, \
         ntf(mode='w+', prefix='seq2', delete=not keep) as fh2, \
         ntf(mode='w+', prefix='align_out') as outfile, \
         open(os.devnull, 'w') as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()  # make the data visible to the child process
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()

        cmd = ['needle', '-gapopen', '0',
               '-gapextend', '0',
               '-outfile',  outfile.name,
               fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        subprocess.check_call(cmd, stderr=dn)
        # The child wrote to the file by name; our handle is still at offset 0.
        result = outfile.read()
        score = re.search(r'# Score: (.*)', result)
        if score is not None:
            return float(score.group(1))
        return 0
Пример #8
0
def frameshift_writer(contigs, file):
    """Write contigs flagged with a majority frameshift to ``file`` as FASTA.

    ``contigs`` is a mapping whose values expose ``seq``, ``id``,
    ``description`` and an ``annotation`` mapping.  ``file`` is closed once
    the records have been written.
    """
    sys.stderr.write("[predict] writing frameshifts...")
    seqs = []
    for contig in contigs.values():
        if contig.annotation['majority_frameshift']:
            seqs.append(SeqRecord(seq=contig.seq, id=contig.id,
                                  description=contig.description))
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Пример #9
0
def no_relatives_writer(contigs, file):
    """Write contigs that have zero relatives to ``file`` as FASTA.

    ``contigs`` is a mapping whose values expose ``seq``, ``id``,
    ``description`` and an ``annotation`` mapping.  ``file`` is closed once
    the records have been written.
    """
    sys.stderr.write("[predict] writing contigs with no relatives...")
    seqs = []
    for contig in contigs.values():
        if contig.annotation['num_relatives'] == 0:
            seqs.append(SeqRecord(seq=contig.seq, id=contig.id,
                                  description=contig.description))
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Пример #10
0
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        This method supports the python format() function added in
        Python 2.6/3.0.  The format_spec should be a lower case string
        supported by Bio.SeqIO as an output file format. See also the
        SeqRecord's format() method.

        An empty format_spec returns ``str(self)``.  Formats listed in
        ``SeqIO._BinaryFormats`` are written to a bytes handle where one is
        available; all other formats are written to a text handle.
        """
        if not format_spec:
            #Follow python convention and default to using __str__
            return str(self)
        from Bio import SeqIO
        if format_spec in SeqIO._BinaryFormats:
            #Return bytes on Python 3
            try:
                #This is in Python 2.6+, but we need it on Python 3
                from io import BytesIO
                handle = BytesIO()
            except ImportError:
                #Must be on Python 2.5 or older
                from StringIO import StringIO
                handle = StringIO()
        else:
            try:
                #Python 2: this StringIO accepts the plain str that SeqIO writes
                from StringIO import StringIO
            except ImportError:
                #Python 3: the StringIO module no longer exists
                from io import StringIO
            handle = StringIO()
        SeqIO.write(self, handle, format_spec)
        return handle.getvalue()
Пример #11
0
def main():
    """Convert an alignment file named on the command line to another format.

    Expects exactly three arguments: input file, input format and output
    format.  The output is written next to the input, with the input's
    extension replaced by the output format name.

    * ``nexus``: converted with AlignIO using the ambiguous DNA alphabet.
    * ``mega``: written by hand; the input is always parsed as
      "phylip-relaxed" in this branch, whatever input format was given.
    * anything else: plain ``AlignIO.convert``.

    Exits with status 1 when the argument count is wrong.
    """
    import os  # local import; only needed for the output name

    if len(sys.argv) != 4:
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    # splitext() removes only the last extension, so "a.b.phy" -> "a.b"
    # (the old "".join(split('.')) form produced "ab").
    fout = os.path.splitext(f)[0]
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    outname = fout + '.' + formatout

    # One if/elif/else chain: 'nexus' used to fall through to the final
    # else and was converted a second time without the alphabet.
    if formatout == 'nexus':
        AlignIO.convert(f, formatin, outname, formatout, alphabet=IUPAC.ambiguous_dna)
    elif formatout == 'mega':
        # Passing the file name lets SeqIO open and close the file itself.
        record_dict = SeqIO.to_dict(SeqIO.parse(f, "phylip-relaxed"))

        with open(outname, 'w') as outfile:
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")

            for n in record_dict:
                outfile.write('#' + n + "\n")
                # Sequence lines are wrapped at 60 characters.
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s + "\n")
    else:
        AlignIO.convert(f, formatin, outname, formatout)
Пример #12
0
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove reads shorter than a minimum length from a pair of fastq files.

    Pairs in which both ends are at least ``min_length`` bases long are
    written, in their original order, to the ``.fixed`` files.  When a pair
    fails, any end that is itself at least ``min_length`` long is written to
    the matching ``.singles`` file.

    @param fq1: first fastq file of the pair
    @param fq2: second fastq file of the pair
    @param quality_format: Bio.SeqIO format name used to read and write
    @param min_length: minimum read length to keep (inclusive)
    @return: ``[fq1_out, fq2_out]``, the names of the two ``.fixed`` files

    If all four output files already exist, nothing is recomputed.
    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    # Check all four outputs (fq2_single used to be tested twice and
    # fq1_single never).
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            fq1_ok = len(fq1_record.seq) >= min_length
            fq2_ok = len(fq2_record.seq) >= min_length
            if fq1_ok and fq2_ok:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                # Same inclusive threshold as for pairs: a read of exactly
                # min_length bases used to be dropped by a strict '>'.
                if fq1_ok:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if fq2_ok:
                    fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
Пример #13
0
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
	"""Write the est2genome match regions of one contig to a FASTA handle.

	The second tab-separated field of ``log_line`` is the per-contig
	directory; its fourth "/"-separated component is used as the contig
	name.  The contig's GFF file is scanned for lines that mention
	``est2genome`` and have the ``expressed_sequence_match`` type; for each
	one the region given by columns 4-5 (1-based, inclusive) is cut from the
	contig in ``velvet_file``, reverse-complemented when column 7 is "-",
	and written to ``est2genome_handle``.

	Nothing is written when the contig is not present in ``velvet_file``.
	"""
	seq_dir = log_line.split("\t")[1]
	contig_name = seq_dir.split("/")[3]  # hardcoded in this position
	tmp_refseq = contig_name.replace(".", "%2E")
	gff_file = refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
	# Parsed lazily and at most once; the file used to be re-parsed for
	# every matching GFF line.
	velvet_records = None
	with open(gff_file, 'r') as gff_handle:
		for gff_line in gff_handle:
			# The source tag was misspelled "est2gneome", so no line matched.
			if "est2genome" not in gff_line or \
			"\texpressed_sequence_match\t" not in gff_line:
				continue
			fields = gff_line.split("\t")
			curr_start = int(fields[3])
			curr_stop = int(fields[4])
			curr_strand = fields[6]

			if velvet_records is None:
				with open(velvet_file, 'r') as tmp_handle:
					velvet_records = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))

			if contig_name not in velvet_records:
				continue
			curr_record = velvet_records[contig_name]
			new_seq = curr_record.seq[curr_start - 1:curr_stop]
			if curr_strand == "-":
				new_seq = new_seq.reverse_complement()
			# NOTE(review): the original used an undefined name `seqname`
			# here.  An id built from the contig name and coordinates is
			# used instead -- confirm the naming scheme callers expect.
			record_name = "%s_%d_%d" % (contig_name, curr_start, curr_stop)
			new_record = SeqRecord(new_seq, id=record_name, name=record_name, description="")

			# SeqIO.write needs the record as first argument (it was missing).
			SeqIO.write(new_record, est2genome_handle, "fasta")
Пример #14
0
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that parse+write and convert fail with the same ValueError.

    First parses ``in_filename`` and writes the records as ``out_format``,
    expecting a ValueError; then runs ``SeqIO.convert`` on the same input,
    expecting a ValueError with an identical message.  An AssertionError is
    raised when either step succeeds or when the messages differ.
    """
    qual_truncate = truncation_expected(out_format)
    #We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename,in_format, alphabet))
        handle = StringIO()
        # catch_warnings() restores the filters even though the write is
        # EXPECTED to raise; the old simplefilter()/filters.pop() pair was
        # skipped on that path (and popped the wrong end of the list).
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    #Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
               "Different failures, parse/write:\n%s\nconvert:\n%s" \
               % (err1, err2)
Пример #15
0
Файл: z4.py Проект: strop/WBO_11
def main_build_markov(promotor_filename = "promotor.fa", genome_filename = "genom.fa", symbol_length = 2, load_cached = False, save_cache = True):
  '''Build a Markov model from a promoter-sequence file and a genome file.

  promotor_filename -- FASTA file with promoter sequences
  genome_filename   -- FASTA file with the genome
  symbol_length     -- symbol length whose frequencies and counts are used
  load_cached       -- load symbol frequencies from the dump files instead
                       of recomputing them
  save_cache        -- when recomputing, also store the frequencies in the
                       dump files

  Returns the tuple (markov, states) produced by build_markov().
  '''
  # Honour the filename parameters (the paths used to be hard-coded to the
  # default values, so passing other names had no effect).
  promotor_sequences = list(SeqIO.parse(promotor_filename, "fasta"))
  genome = list(SeqIO.parse(genome_filename, "fasta"))
  if not load_cached:
    promotor_freqs = calc_symbol_freq(promotor_sequences)
    genome_freqs = calc_symbol_freq(genome)
    if save_cache:
      dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
      dump_obj(genome_freqs, Dumpfiles.genome_freq)
  else:
    promotor_freqs = load_obj(Dumpfiles.promotor_freq)
    genome_freqs = load_obj(Dumpfiles.genome_freq)

  promotor_counts = calc_counts(promotor_sequences)
  genome_counts = calc_counts(genome)

  print(promotor_counts)

  promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length], symbol_length, promotor_counts[symbol_length])
  genome_freqs = fold_and_normalize(genome_freqs[symbol_length], symbol_length, genome_counts[symbol_length])

  # Both models must cover the same symbols.  The second loop used to test
  # genome_freqs against itself, which can never fail.
  for k in promotor_freqs:
    assert(k in genome_freqs)
  for k in genome_freqs:
    assert(k in promotor_freqs)

  print(promotor_freqs)
  (markov, states) = build_markov(genome_freqs, promotor_freqs)

  return (markov, states)
def illumina2sangerFq(inputfile):
    """Convert a "fastq-illumina" file to the "fastq" format.

    The output name is ``inputfile`` with its last three characters removed
    and ``.fastq`` appended (for example ``reads.fq`` -> ``reads.fastq``).
    """
    # The leftover debugging call that printed help(SeqIO.convert) on every
    # invocation has been removed.
    filename = inputfile[:-3] + '.fastq'

    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
Пример #17
0
def cluster_pid(folder):
    """Compute pairwise identity statistics for each human gene cluster.

    Reads ``<folder>/report/<name>_genes.csv`` (``<name>`` is the last path
    component of ``folder``), keeps rows with species 'Homo sapiens' whose
    cluster is not 'na'/'0'/0, and for every remaining cluster globally
    aligns each ordered pair of its protein accessions (self-pairs
    included) with sequences taken from ``../cgpf_ncbi/all_seqs.fa``.

    Returns a list of ``(cluster, n_pairs, mean_pid, sd_pid)`` tuples, where
    ``n_pairs`` is the number of pairwise comparisons made, or ``None`` if
    an OSError occurs (for example when the CSV file is missing).

    Raises KeyError when an accession is absent from the FASTA file.
    """
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) & (genes['species'] == 'Homo sapiens')]
        if genes.shape[0] > 0:
            # Index the needed sequences once; the FASTA file used to be
            # re-parsed twice for every pair of accessions.
            wanted = set(genes['prot_acc'])
            seq_by_acc = {}
            for rec in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta'):
                acc = rec.name.split("|")[2]
                # Keep the first occurrence, as the original lookup did.
                if acc in wanted and acc not in seq_by_acc:
                    seq_by_acc[acc] = rec.seq

            for cluster in set(genes['cluster']):
                pids = []
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                for seq1 in accs:
                    for seq2 in accs:
                        aln = pairwise2.align.globalxx(seq_by_acc[seq1], seq_by_acc[seq2])[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        # aln[2] is the alignment score returned by pairwise2.
                        pids.append(aln[2] / mean_len)

                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                # list.append() takes one argument: store the row as a tuple.
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
        return result
    except OSError:
        return None
Пример #18
0
def CutOutDomain(coords,filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut coordinate-defined regions out of FASTA sequences.

    ``coords`` is a comma-separated file; for each line, the value in
    column ``column_id`` names a sequence and columns ``column_start`` /
    ``column_stop`` give the 1-based inclusive region to keep.  Only the
    first line seen for a given id is used.  Every sequence of
    ``filename`` whose id appears in ``coords`` is sliced, renamed to
    ``<id>_<start>_<stop>`` and written to ``CutOutdomain_<filename>``.

    Set ``header=True`` to skip the first line of ``coords``.
    """
    from Bio import SeqIO
    Towrite=[]
    CoordIDDic={}
    # Context managers close all three files; the output handle was never
    # closed before, so buffered records could be lost.
    with open(coords) as fh, open(filename) as seqfile:
        if header==True:
            print('header set to True, first line of %s will be ignored'%coords)
            fh.readline()
        else:
            print('header not set to True, first line of %s will be processed'%coords)

        for unformatedLine in fh:
            # Drop non-breaking spaces before splitting the CSV fields.
            l=unformatedLine.replace('\xa0', '').strip().split(',')
            if l[column_id] not in CoordIDDic:
                CoordIDDic[l[column_id]]=l[column_start], l[column_stop]

        # The loops contained no break, so the former for/else blocks always
        # ran; they are plain sequential steps now.
        for s in SeqIO.parse(seqfile, 'fasta'):
            if s.id in CoordIDDic:
                start_field, stop_field = CoordIDDic[s.id]
                start=int(start_field)-1
                stop=int(stop_field)
                # Rename before slicing so the sliced record carries the new id.
                s.id=s.id+'_%s_%s'%((start+1), stop)
                Towrite.append(s[start:stop])

    with open('CutOutdomain_%s'%filename, 'w') as Output:
        SeqIO.write(Towrite, Output, 'fasta')
Пример #19
0
def main(gbdir, outdir):
    """Run blastn of every organism's spacers against its own genome.

    For each ``Organism`` the GenBank file ``<gbdir>/<accession>.gb`` is
    fetched if missing and converted to FASTA, the organism's spacers are
    written to a query FASTA, and ``blastn`` (``-outfmt 15``) writes its
    result to ``<outdir>/<accession>.json``.

    The two temporary FASTA files live in the current directory and are
    removed when the function returns or fails.
    """
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    try:
        for org in tqdm(Organism.objects.all()):
            # get genbank and convert to fasta
            fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
            if not os.path.isfile(fpath):
                print('\nFetching {} with accession {}'.format(
                    org.name,
                    org.accession
                ))
                fetch(fpath)
            SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')
            # get spacers of organism and convert to fasta
            spacers = Spacer.objects.filter(loci__organism=org)
            fastatext = ''.join('>{}\n{}\n'.format(spacer.id, spacer.sequence)
                                for spacer in spacers)
            with open(tempq, 'w') as f:
                f.write(fastatext)
            # run blast and save output
            outpath = os.path.join(outdir, '{}.json'.format(org.accession))
            commandargs = ['blastn', '-query', tempq,
                           '-subject', tempdb, '-out', outpath, '-outfmt', '15']
            subprocess.run(commandargs, stdout=subprocess.DEVNULL)
    finally:
        # The temp files only exist once an organism has been processed, so
        # an unconditional os.remove() crashed when there were none.
        for temp_path in (tempq, tempdb):
            if os.path.exists(temp_path):
                os.remove(temp_path)
Пример #20
0
 def _get_seq_dict(self):
     """Internal reusable function to get the sequence dictionary.

     Parses the FASTA file named by ``self._test_seq_file`` and returns the
     result of ``SeqIO.to_dict`` for its records.
     """
     # 'with' closes the file even if parsing raises.
     with open(self._test_seq_file) as seq_handle:
         return SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
Пример #21
0
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.

    Each line of ``infname`` is one cluster: a whitespace-separated list of
    sequence IDs.  Each individual FASTA file contains all sequences from a
    single cluster; the sequences are looked up by ID in ``seqfname``, so
    all listed IDs must be present in that one file.

    Files are named ``blastclust_OTU_<n>.fasta`` (six-digit, 1-based line
    number) inside ``<outdir>/blastclust_OTUs``.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    try:
        with open(infname, 'r') as fh:
            for otu_id, line in enumerate(fh, 1):
                outfname = os.path.join(outdirname,
                                        "blastclust_OTU_%06d.fasta" % otu_id)
                SeqIO.write((seqdict[key] for key in line.split()),
                            outfname, 'fasta')
                outfnames.append(outfname)
    finally:
        # The index keeps the sequence file open; release it when done.
        seqdict.close()
    return (outdirname, outfnames)
Пример #22
0
    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.

        The GFF file is parsed with a source/type limit and every record is
        simply iterated over; there are no assertions.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        # 'with' closes the (large) sequence file even if parsing raises.
        with open(seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                    ('GenePair_STS', 'PCR_product'),
                    ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        # Restrict parsing to the (source, type) pairs listed above.
        limit_info = dict(gff_source_type = rnai_types + gene_types)
        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
Пример #23
0
 def test_fasta_out(self):
     """Check that FASTQ converted to FASTA matches the reference file."""
     out_handle = StringIO()
     SeqIO.write(SeqIO.parse("Quality/example.fastq", "fastq"), out_handle, "fasta")
     with open("Quality/example.fasta") as expected_handle:
         expected_text = expected_handle.read()
     self.assertEqual(out_handle.getvalue(), expected_text)
def main(args):
    """Set (or clear) the taxon of a list of BioSQL entries.

    The entry names come from the first available source among
    ``args.fasta`` (record names), ``args.genbank`` (record names) and
    ``args.input`` (one name per line).  With ``args.remove`` the taxon is
    set to NULL; otherwise it is the id returned by ``add_new_taxonomy``.
    The sub-database ``args.database_name`` is created when missing.
    """
    server = BioSeqDatabase.open_database(
        driver=args.driver,
        db=args.database,
        user=args.user,
        host=args.host,
        passwd=args.password,
    )
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    # Collect the display ids of the entries to update.
    if args.fasta is not None:
        gen = [rec.name for rec in SeqIO.parse(args.fasta, 'fasta')]
    elif args.genbank is not None:
        gen = [rec.name for rec in SeqIO.parse(args.genbank, 'genbank')]
    elif args.input is not None:
        with open(args.input) as fp:
            gen = [line.rstrip() for line in fp]
    else:
        gen = []

    if args.remove:
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

    update_sql = 'update bioentry set taxon_id = %s where bioentry_id = %s'
    for display_id in gen:
        bioentry_id = db.adaptor.fetch_seqid_by_display_id(db.dbid, display_id)
        server.adaptor.execute(update_sql, (taxon_id, bioentry_id))
    server.commit()
Пример #25
0
 def test_fastq_1000(self):
     """Round-trip a single mixed case 1000bp FASTQ read through parse/write."""
     title = "id descr goes here"
     bases = "ACGTNncgta"*100
     quals = "abcd!!efgh"*100
     data = "@%s\n%s\n+\n%s\n" % (title, bases, quals)
     out_handle = StringIO()
     parsed = SeqIO.parse(StringIO(data), "fastq")
     written = SeqIO.write(parsed, out_handle, "fastq")
     # Exactly one record must be written, byte-for-byte identical.
     self.assertEqual(1, written)
     self.assertEqual(data, out_handle.getvalue())
Пример #26
0
def count_overlap(filename):
    """Print every ordered pair of records whose sequences overlap by 3.

    For each pair (a, b) of records in the FASTA file with different
    sequences, prints "<a.id> <b.id>" when the last three letters of a
    equal the first three letters of b.
    """
    # Parse the file once; it used to be re-parsed from scratch for every
    # record in the outer loop.
    records = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(filename, "fasta")]
    for head_id, head_seq in records:
        suffix = head_seq[-3:]
        for tail_id, tail_seq in records:
            if head_seq != tail_seq and suffix == tail_seq[0:3]:
                print(head_id + " " + tail_id)
Пример #27
0
def load_examples_from_fasta(signal, org, data_path):
    """
    Load positive and negative example sequences for a signal from FASTA.

    Reads ``<data_path>/<signal>_sig_pos_example.fa`` and
    ``<data_path>/<signal>_sig_neg_example.fa``, labels the sequences +1
    and -1 respectively, and shuffles examples and labels together with
    ``helper.coshuffle``.

    ``org`` is only used in the printed summary.

    Returns a dict with numpy arrays under "examples" and "labels".
    """

    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    # Function-call form, like the second print below: a print statement
    # is a syntax error on Python 3.
    print("loading: \n %s \n %s" % (fn_pos, fn_neg))

    # parse file
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print(
        "organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
        % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg))
    )

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled), "labels": numpy.array(labels_shuffled)}

    return ret
Пример #28
0
 def _validate_fasta(self, text):
     """Return ``text`` unchanged if it yields at least one FASTA record.

     Raises argparse.ArgumentTypeError when parsing ``text`` as FASTA
     produces no record at all.
     """
     try:
         # Built-in next() works on Python 2.6+ and Python 3; the
         # iterator's .next() method exists on Python 2 only.
         next(SeqIO.parse(text, 'fasta'))
     except StopIteration:
         raise argparse.ArgumentTypeError(
             "{0} is not fasta file".format(text))
     return text
Пример #29
0
    def standard_test_procedure(self, cline):
        """Run a command line object and validate the alignment it writes.

        Shared by all tests: forces overwriting, registers the output files
        for cleanup, runs ``cline`` and checks its stdout/stderr, then
        compares the CLUSTAL output file with the input FASTA records.
        """

        # Let the tool replace output left behind by earlier runs.
        cline.force = True

        # Everything the run creates is registered for later removal.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        records_in = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        # repr() of the command line must evaluate back to an equal command.
        self.assertEqual(str(eval(repr(cline))), str(cline))
        stdout_text, stderr_text = cline()
        self.assertTrue(not stdout_text or stdout_text.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully: stderr is either
        # blank or starts with one of the known harmless warnings.
        harmless_warnings = ("WARNING: Sequence type is DNA.",
                             "WARNING: DNA alignment is still experimental.")
        self.assertTrue(not stderr_text.strip() or
                        stderr_text.startswith(harmless_warnings))

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        records_out = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertEqual(len(set(records_in)), len(set(records_out)))
        for record in align:
            self.assertEqual(str(record.seq), str(records_out[record.id].seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Пример #30
0
    def test_acba_annot(self):
        """Run integron_finder with functional annotation and check its outputs.

        Compares the produced GenBank file, the ``.integrons`` table and the
        Resfams HMM table kept in the tmp directory with the reference
        results stored under ``Results_Integron_Finder_<replicon>.annot``.
        """
        replicon_filename = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename)))
        command_template = ("integron_finder --outdir {out_dir} --func-annot --path-func-annot {annot_bank} --promoter-attI "
                            "--gbk --keep-tmp "
                            "{replicon}")
        command = command_template.format(out_dir=self.out_dir,
                                          annot_bank=self.resfams_dir,
                                          replicon=replicon_path)

        # Drop the program name; main() only receives the arguments.
        with self.catch_io(out=True, err=False):
            main(command.split()[1:], loglevel='WARNING')

        result_dir = os.path.join(self.out_dir, 'Results_Integron_Finder_{}'.format(replicon_filename))
        expected_dir = 'Results_Integron_Finder_{}.annot'.format(replicon_filename)

        # GenBank output
        gbk = '{}.gbk'.format(replicon_id)
        expected_record = SeqIO.read(self.find_data(os.path.join(expected_dir, gbk)), 'gb')
        produced_record = SeqIO.read(os.path.join(result_dir, gbk), 'gb')
        self.assertSeqRecordEqual(expected_record, produced_record)

        # Integron table
        integrons_name = '{}.integrons'.format(replicon_filename)
        self.assertIntegronResultEqual(
            self.find_data(os.path.join(expected_dir, integrons_name)),
            os.path.join(result_dir, integrons_name))

        # Resfams HMM table kept in the tmp directory
        hmm_name = os.path.join('tmp_{}'.format(replicon_id), replicon_id + '_Resfams_fa_table.res')
        self.assertHmmEqual(
            self.find_data(os.path.join(expected_dir, hmm_name)),
            os.path.join(result_dir, hmm_name))