Exemplo n.º 1
0
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
    """Append the est2genome match regions of one contig to a FASTA handle.

    The GFF file of the contig named in ``log_line`` is scanned for
    ``expressed_sequence_match`` lines produced by est2genome.  For every
    such line the matching slice of the contig (taken from ``velvet_file``)
    is written to ``est2genome_handle``; matches on the minus strand are
    reverse-complemented first.

    refseq_obj        -- object whose ``id`` prefixes the MAKER output directory
    est2genome_handle -- open, writable handle receiving FASTA records
    log_line          -- tab-separated line; field 1 is the contig directory
    velvet_file       -- FASTA file holding the contig sequences
    """
    seq_dir = log_line.split("\t")[1]
    # The contig name is hardcoded as the fourth component of the path.
    contig_name = seq_dir.split("/")[3]
    tmp_refseq = contig_name.replace(".", "%2E")
    gff_file = (refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir +
                "/" + tmp_refseq + ".gff")

    contigs = None  # loaded lazily, and only once, on the first match line
    with open(gff_file, 'r') as gff_handle:
        for gff_line in gff_handle:
            if ("est2genome" not in gff_line or
                    "\texpressed_sequence_match\t" not in gff_line):
                continue

            fields = gff_line.split("\t")
            curr_start = int(fields[3])
            curr_stop = int(fields[4])
            curr_strand = fields[6]

            if contigs is None:
                with open(velvet_file, 'r') as tmp_handle:
                    contigs = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))
            if contig_name not in contigs:
                continue
            curr_record = contigs[contig_name]

            # GFF coordinates are used as 1-based inclusive here.
            new_seq = curr_record.seq[curr_start - 1:curr_stop]
            if curr_strand == "-":
                new_seq = new_seq.reverse_complement()
            # NOTE(review): ``seqname`` is not defined inside this function;
            # it must exist at module level or this raises NameError - confirm.
            new_record = SeqRecord(new_seq, id=seqname, name=seqname,
                                   description="")

            SeqIO.write(new_record, est2genome_handle, "fasta")
Exemplo n.º 2
0
    def standard_test_procedure(self, cline):
        """Run a command line object and sanity-check the files it produced."""

        # Let the tool replace files left behind by earlier runs.
        cline.force = True

        # Register every output file so that it is removed afterwards.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))

        # The repr of the command line must evaluate back to an equal object.
        rebuilt = eval(repr(cline))
        self.assertEqual(str(rebuilt), str(cline))

        output, error = cline()
        self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

        # Only an empty stderr or one of the known DNA warnings is accepted.
        accepted_warnings = ("WARNING: Sequence type is DNA.",
                             "WARNING: DNA alignment is still experimental.")
        stderr_ok = error.strip() == "" or error.startswith(accepted_warnings)
        self.assertTrue(stderr_ok)

        # Read the alignment twice (AlignIO and SeqIO) and compare the views.
        align = AlignIO.read(cline.outfile, "clustal")
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertEqual(len(set(input_records.keys())),
                         len(set(output_records.keys())))
        for aligned in align:
            from_seqio = output_records[aligned.id]
            self.assertEqual(str(aligned.seq), str(from_seqio.seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Exemplo n.º 3
0
def load_examples_from_fasta(signal, org, data_path):
    """
    Load positive and negative example sequences from two FASTA files.

    The files are ``<data_path>/<signal>_sig_pos_example.fa`` and
    ``<data_path>/<signal>_sig_neg_example.fa``.  Sequences from the first
    file are labelled +1, sequences from the second -1, and both lists are
    shuffled together with ``helper.coshuffle``.

    signal    -- signal name used to build the file names
    org       -- organism name; only used in the progress message
    data_path -- directory containing the two FASTA files

    Returns a dict with numpy arrays under the keys "examples" and "labels".
    """

    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("loading: \n %s \n %s" % (fn_pos, fn_neg))

    # parse file
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print(
        "organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
        % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg))
    )

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled), "labels": numpy.array(labels_shuffled)}

    return ret
Exemplo n.º 4
0
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove reads shorter than ``min_length`` from a pair of fastq files.

    Both files are read in lockstep.  A pair is kept (in the ``.fixed``
    outputs) only when both mates have at least ``min_length`` bases, so the
    order of the reads is maintained.  When only one mate is long enough it
    is written to that file's ``.singles`` output instead.

    fq1, fq2       -- paths of the paired fastq files
    quality_format -- Bio.SeqIO format name used for reading and writing
    min_length     -- minimum number of bases a read must have to be kept

    Returns ``[fq1_out, fq2_out]``, the paths of the two ``.fixed`` files.
    If all four output files already exist nothing is recomputed.
    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    # All four outputs must be present before the work is skipped.
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, \
            open(fq2_out, 'w') as fq2_out_handle, \
            open(fq1_single, 'w') as fq1_single_handle, \
            open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            # Same ">=" threshold for pairs and singles, so a read of exactly
            # min_length bases is never silently dropped.
            keep1 = len(fq1_record.seq) >= min_length
            keep2 = len(fq2_record.seq) >= min_length
            if keep1 and keep2:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            elif keep1:
                fq1_single_handle.write(fq1_record.format(quality_format))
            elif keep2:
                fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
Exemplo n.º 5
0
 def loop(self, filename, format):
     """Round-trip the records of ``filename`` through BioSQL and GenBank output.

     The records are loaded into a fresh BioSQL namespace, looked up again
     by name, written out in GenBank format and parsed once more; each
     stage is compared with the original records.
     """
     # NOTE(review): the "U" open mode was removed in Python 3.11; it is kept
     # here because the rest of this snippet looks like Python 2 era code.
     with open(filename, "rU") as in_handle:
         original_records = list(SeqIO.parse(in_handle, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     try:
         db_name = "test_loop_%s" % filename  # new namespace!
         db = server.new_database(db_name)
         count = db.load(original_records)
         self.assertEqual(count, len(original_records))
         server.commit()
         #Now read them back...
         biosql_records = [db.lookup(name=rec.name)
                           for rec in original_records]
         #And check they agree
         self.assertTrue(compare_records(original_records, biosql_records))
         #Now write to a handle...
         handle = StringIO()
         SeqIO.write(biosql_records, handle, "gb")
         #Now read them back...
         handle.seek(0)
         new_records = list(SeqIO.parse(handle, "gb"))
         #And check they still agree
         self.assertEqual(len(new_records), len(original_records))
         for old, new in zip(original_records, new_records):
             #TODO - remove this hack because we don't yet write these (yet):
             for key in ["comment", "references", "db_source"]:
                 if key in old.annotations and key not in new.annotations:
                     del old.annotations[key]
             self.assertTrue(compare_record(old, new))
     finally:
         # Close the connection even when one of the assertions above fails.
         server.close()
Exemplo n.º 6
0
 def test_fastq_1000(self):
     """Read and write back simple example with mixed case 1000bp read"""
     title = "id descr goes here"
     sequence = "ACGTNncgta"*100
     quality = "abcd!!efgh"*100
     data = "@%s\n%s\n+\n%s\n" % (title, sequence, quality)
     handle = StringIO()
     # Exactly one record must be written, and the text must be unchanged.
     written = SeqIO.write(SeqIO.parse(StringIO(data), "fastq"), handle, "fastq")
     self.assertEqual(1, written)
     self.assertEqual(data, handle.getvalue())
Exemplo n.º 7
0
 def test_generated(self):
     """Write and read back odd SeqRecord objects"""
     record1 = SeqRecord(Seq("ACGT"*500, generic_dna),  id="Test", description="Long "*500,
                        letter_annotations={"phred_quality":[40,30,20,10]*500})
     record2 = SeqRecord(MutableSeq("NGGC"*1000),  id="Mut", description="very "*1000+"long",
                        letter_annotations={"phred_quality":[0,5,5,10]*1000})
     record3 = SeqRecord(UnknownSeq(2000,character="N"),  id="Unk", description="l"+("o"*1000)+"ng",
                        letter_annotations={"phred_quality":[0,1]*1000})
     record4 = SeqRecord(Seq("ACGT"*500),  id="no_descr", description="", name="",
                        letter_annotations={"phred_quality":[40,50,60,62]*500})
     record5 = SeqRecord(Seq("",generic_dna),  id="empty_p", description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality":[]})
     record6 = SeqRecord(Seq(""),  id="empty_s", description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality":[]})
     record7 = SeqRecord(Seq("ACNN"*500),  id="Test_Sol", description="Long "*500,
                        letter_annotations={"solexa_quality":[40,30,0,-5]*500})
     record8 = SeqRecord(Seq("ACGT"),  id="HighQual", description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality":[0,10,100,1000]})
     #TODO - Record with no identifier?
     records = [record1, record2, record3, record4, record5, record6, record7, record8]
     #TODO - Have a Biopython defined "DataLossWarning?"
     # catch_warnings restores the previous filter list on exit.  The old
     # simplefilter()/filters.pop() pair was wrong: simplefilter inserts at
     # the front of warnings.filters while pop() removes the last entry.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         #TODO - Include phd output?
         for fmt in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
             handle = StringIO()
             SeqIO.write(records, handle, fmt)
             handle.seek(0)
             compare_records(records,
                             list(SeqIO.parse(handle, fmt)),
                             truncation_expected(fmt))
Exemplo n.º 8
0
 def test_fasta_out(self):
     """Check FASTQ to FASTA output"""
     buffer = StringIO()
     # Convert the FASTQ example in memory ...
     SeqIO.write(SeqIO.parse("Quality/example.fastq", "fastq"), buffer, "fasta")
     produced = buffer.getvalue()
     # ... and compare it with the stored FASTA version of the same data.
     with open("Quality/example.fasta") as expected:
         self.assertEqual(produced, expected.read())
Exemplo n.º 9
0
 def test_fastq_2000(self):
     """Read and write back simple example with upper case 2000bp read"""
     title = "id descr goes here"
     sequence = "ACGT"*500
     quality = "!@a~"*500
     data = "@%s\n%s\n+\n%s\n" % (title, sequence, quality)
     handle = StringIO()
     # Exactly one record must be written, and the text must be unchanged.
     written = SeqIO.write(SeqIO.parse(StringIO(data), "fastq"), handle, "fastq")
     self.assertEqual(1, written)
     self.assertEqual(data, handle.getvalue())
Exemplo n.º 10
0
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.

    Each line of ``infname`` is treated as one cluster: a whitespace
    separated list of sequence IDs.  For every line a file
    ``blastclust_OTU_NNNNNN.fasta`` (numbered from 1) is written to
    ``<outdir>/blastclust_OTUs`` containing the listed sequences, which are
    looked up by ID in ``seqfname``.

    infname  -- BLASTCLUST output .lst file
    seqfname -- FASTA file holding every sequence named in ``infname``
    outdir   -- parent directory for the ``blastclust_OTUs`` subdirectory

    Returns the output directory and a list of the files, as a tuple.
    Raises KeyError if an ID in ``infname`` is not present in ``seqfname``.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    try:
        with open(infname, 'r') as fh:
            for otu_id, line in enumerate(fh, 1):
                outfname = os.path.join(outdirname,
                                        "blastclust_OTU_%06d.fasta" % otu_id)
                SeqIO.write((seqdict[key] for key in line.split()),
                            outfname, 'fasta')
                outfnames.append(outfname)
    finally:
        # The index keeps the sequence file open until it is closed.
        seqdict.close()
    return (outdirname, outfnames)
Exemplo n.º 11
0
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:
        MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:
        Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:
        Protein sequences in FASTA format (.fasta)
    @return:
        Name of the file that receives PAL2NAL's "-output paml" result
        ("homologs.codon.aln"), for use with codeml
    @raise OSError:
        If the pal2nal.pl script cannot be executed
    """
    import subprocess

    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    # Proteins and nucleotides are paired by position in their files.
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")
    fname_codon = "homologs.codon.aln"
    # An argument list with no shell: file names containing spaces or shell
    # metacharacters are passed through verbatim.  The exit status is
    # ignored, as it was with os.system.
    command = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2,
               "-output", "paml"]
    with open(fname_codon, "w") as codon_out:
        subprocess.call(command, stdout=codon_out)
    return fname_codon
Exemplo n.º 12
0
def main(gbdir, outdir):
    """BLAST the spacers of every organism against its own GenBank record.

    For each ``Organism`` the GenBank file ``<gbdir>/<accession>.gb`` is
    fetched if it is missing and converted to FASTA, the organism's spacers
    are written to a second FASTA file, and ``blastn`` is run with the
    spacers as query and the genome as subject.  The result is stored as
    ``<outdir>/<accession>.json`` (blastn ``-outfmt 15``).

    gbdir  -- directory holding (or receiving) the GenBank files
    outdir -- directory receiving the BLAST output files
    """
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    try:
        for org in tqdm(Organism.objects.all()):
            # get genbank and convert to fasta
            fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
            if not os.path.isfile(fpath):
                print('\nFetching {} with accession {}'.format(
                    org.name,
                    org.accession
                ))
                fetch(fpath)
            SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')
            # get spacers of organism and convert to fasta
            spacers = Spacer.objects.filter(loci__organism=org)
            fastatext = ''.join(['>{}\n{}\n'.format(spacer.id, spacer.sequence)
                                 for spacer in spacers])
            with open(tempq, 'w') as f:
                f.write(fastatext)
            # run blast and save output
            outpath = os.path.join(outdir, '{}.json'.format(org.accession))
            commandargs = ['blastn', '-query', tempq,
                           '-subject', tempdb, '-out', outpath, '-outfmt', '15']
            subprocess.run(commandargs, stdout=subprocess.DEVNULL)
    finally:
        # The temp files only exist once an organism has been processed, so
        # remove them conditionally; this also cleans up after an error.
        for temp_path in (tempq, tempdb):
            if os.path.exists(temp_path):
                os.remove(temp_path)
Exemplo n.º 13
0
def needle_score(seq1, seq2, verbose=False, keep=False):
    """
    Get the Needleman-Wunsch score for aligning two sequences.

    Both sequences are written to temporary FASTA files and the external
    ``needle`` program is run on them with gap open and gap extend
    penalties of 0.  The score is read from the "# Score:" line of the
    program's output file.

    seq1, seq2 -- records accepted by ``SeqIO.write``
    verbose    -- print the command line before running it
    keep       -- keep the two temporary input files instead of deleting them

    Returns the score as a float, or 0 if no score line is found.
    Raises subprocess.CalledProcessError if ``needle`` exits non-zero.
    """
    ntf = tempfile.NamedTemporaryFile
    # Text mode ("w+") is required: SeqIO.write emits str and the score is
    # searched with a str pattern; the default "w+b" breaks both on Python 3.
    with ntf(mode='w+', prefix='seq1', delete = not keep) as fh1, \
         ntf(mode='w+', prefix='seq2', delete = not keep) as fh2, \
         ntf(mode='w+', prefix='align_out') as outfile, \
         open(os.devnull, 'w') as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()

        cmd = ['needle', '-gapopen', '0',
               '-gapextend', '0',
               '-outfile',  outfile.name,
               fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        # devnull is opened for writing because it receives needle's stderr.
        subprocess.check_call(cmd, stderr=dn)
        result = outfile.read()
        score = re.search(r'# Score: (.*)', result)
        if score is not None:
            return float(score.group(1))
        return 0
Exemplo n.º 14
0
def cluster_pid(folder):
    """Summarise pairwise identity inside every human gene cluster of a report.

    Reads ``<folder>/report/<name>_genes.csv`` (``<name>`` is the last path
    component of ``folder``), keeps the 'Homo sapiens' rows whose cluster is
    not 'na'/'0'/0, and globally aligns every ordered pair of protein
    accessions (self-pairs included) within each cluster.  Sequences come
    from "../cgpf_ncbi/all_seqs.fa", keyed by the third "|"-separated field
    of the record name.

    Returns a list of ``(cluster, n_pairs, mean_pid, sd_pid)`` tuples, or
    None if a file cannot be read (OSError).  Raises IndexError if an
    accession has no sequence in the FASTA file.
    """
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        # NOTE(review): the species literal arrived mangled as 'H**o sapiens';
        # restored to the presumed 'Homo sapiens' - confirm against the data.
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) & (genes['species'] == 'Homo sapiens')]
        if genes.shape[0] > 0:
            # Parse the FASTA file once instead of twice per sequence pair;
            # the first record seen for an accession wins, as before.
            seq_by_acc = {}
            for rec in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta'):
                seq_by_acc.setdefault(rec.name.split("|")[2], rec.seq)

            for cluster in set(genes['cluster']):
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                missing = [acc for acc in accs if acc not in seq_by_acc]
                if missing:
                    raise IndexError("no sequence found for accession(s): %s"
                                     % ", ".join(str(acc) for acc in missing))

                pids = []
                for seq1 in accs:
                    for seq2 in accs:
                        aln = pairwise2.align.globalxx(seq_by_acc[seq1],
                                                       seq_by_acc[seq2])[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        pids.append(aln[2] / mean_len)

                # NOTE(review): this counts ordered pairs (self-pairs too),
                # not genes - confirm that this is the intended statistic.
                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                # list.append takes one argument: store the row as a tuple.
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
        return result
    except OSError:
        return None
Exemplo n.º 15
0
 def setUp(self):
     """Build a codon alignment for each test fixture and keep them in self.alns."""
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     built = []
     for fixture in self.aln_file:
         # fixture[1] names the loading strategy, fixture[0] holds the paths:
         # nucleotide FASTA, protein CLUSTAL and, for 'id', a mapping file.
         if fixture[1] == 'parse':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
         elif fixture[1] == 'index':
             nucl = SeqIO.index(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
         elif fixture[1] == 'id':
             nucl = SeqIO.parse(fixture[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(fixture[0][1], 'clustal', alphabet=IUPAC.protein)
             # Each line of the mapping file supplies a key (first field)
             # and its value (second field).
             corr = {}
             with open(fixture[0][2]) as handle:
                 for line in handle:
                     fields = line.split()
                     corr[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, corr_dict=corr, alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
         nucl.close()  # Close the indexed FASTA file
     self.alns = built
Exemplo n.º 16
0
 def _validate_fasta(self, text):
     """Return ``text`` if it names a FASTA file with at least one record.

     Raises argparse.ArgumentTypeError when the parser yields no record.
     Errors raised while opening or parsing the file are not converted.
     """
     try:
         # The next() builtin works on Python 2.6+ and Python 3, whereas the
         # generator method .next() exists only on Python 2.
         next(SeqIO.parse(text, 'fasta'))
         return text
     except StopIteration:
         raise argparse.ArgumentTypeError(
             "{0} is not fasta file".format(text))
Exemplo n.º 17
0
Arquivo: z4.py Projeto: strop/WBO_11
def main_build_markov(promotor_filename = "promotor.fa", genome_filename = "genom.fa", symbol_length = 2, load_cached = False, save_cache = True):
    """Build a Markov model from promoter-sequence and genome FASTA files.

    promotor_filename -- FASTA file with the promoter sequences
    genome_filename   -- FASTA file with the genome
    symbol_length     -- symbol length whose frequencies and counts are used
    load_cached       -- load symbol frequencies from the dump files instead
                         of recomputing them
    save_cache        -- when recomputing, also store the frequencies in the
                         dump files

    Returns the ``(markov, states)`` tuple produced by ``build_markov``.
    """
    # Honour the filename arguments; they used to be ignored in favour of
    # hard-coded paths equal to the defaults.
    promotor_sequences = list(SeqIO.parse(promotor_filename, "fasta"))
    genome = list(SeqIO.parse(genome_filename, "fasta"))
    if not load_cached:
        promotor_freqs = calc_symbol_freq(promotor_sequences)
        genome_freqs = calc_symbol_freq(genome)
        if save_cache:
            dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
            dump_obj(genome_freqs, Dumpfiles.genome_freq)
    else:
        promotor_freqs = load_obj(Dumpfiles.promotor_freq)
        genome_freqs = load_obj(Dumpfiles.genome_freq)

    promotor_counts = calc_counts(promotor_sequences)
    genome_counts = calc_counts(genome)

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(promotor_counts)

    promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length], symbol_length, promotor_counts[symbol_length])
    genome_freqs = fold_and_normalize(genome_freqs[symbol_length], symbol_length, genome_counts[symbol_length])

    # Both tables must contain exactly the same symbols.  The second loop
    # used to compare genome_freqs with itself, which can never fail.
    for k in promotor_freqs:
        assert(k in genome_freqs)
    for k in genome_freqs:
        assert(k in promotor_freqs)

    print(promotor_freqs)
    (markov, states) = build_markov(genome_freqs, promotor_freqs)

    return (markov, states)
def main(args):
    """Set (or clear) the taxon of a list of bioentries in a BioSQL database.

    The entry names come from the first available source among
    ``args.fasta``, ``args.genbank`` and ``args.input`` (one name per line).
    With ``args.remove`` the taxon is set to NULL; otherwise it is the value
    returned by ``add_new_taxonomy``.
    """
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    # Create the sub-database on first use.
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    # Collect the display names of the entries to update.
    if args.fasta is not None:
        names = [record.name for record in SeqIO.parse(args.fasta, 'fasta')]
    elif args.genbank is not None:
        names = [record.name for record in SeqIO.parse(args.genbank, 'genbank')]
    elif args.input is not None:
        with open(args.input) as fp:
            names = [line.rstrip() for line in fp]
    else:
        names = []

    taxon_id = None if args.remove else add_new_taxonomy(server, args.new_taxons, args.taxid)

    for name in names:
        bioentry_id = db.adaptor.fetch_seqid_by_display_id(db.dbid, name)
        server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s', (taxon_id, bioentry_id))
    server.commit()
Exemplo n.º 19
0
def count_overlap(filename):
    """Print every ordered pair of records whose sequences overlap by 3.

    A pair "A B" is printed when the last three letters of A's sequence
    equal the first three letters of B's sequence and the two sequences are
    not identical.

    filename -- FASTA file (anything accepted by ``SeqIO.parse``)
    """
    # Parse the file once; it used to be re-read for every record.
    records = list(SeqIO.parse(filename, "fasta"))
    for seq_record in records:
        s1 = seq_record.seq
        for seq_record_1 in records:
            s2 = seq_record_1.seq
            if s1 != s2 and s1[-3:] == s2[0:3]:
                print(seq_record.id + " " + seq_record_1.id)
Exemplo n.º 20
0
 def _get_seq_dict(self):
     """Internal reusable function to get the sequence dictionary.

     Returns the records of the FASTA file ``self._test_seq_file`` as a
     dictionary built by ``SeqIO.to_dict``.
     """
     # "with" closes the handle even if parsing raises.
     with open(self._test_seq_file) as seq_handle:
         return SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
Exemplo n.º 21
0
def illumina2sangerFq(inputfile):
    """Convert a "fastq-illumina" file to "fastq" with ``SeqIO.convert``.

    The output name is ``inputfile`` without its last three characters,
    plus ".fastq".

    inputfile -- path of the fastq-illumina file
    """
    # NOTE(review): the slice presumably strips a three-character suffix such
    # as ".fq"; other extensions give odd names - confirm with callers.
    filename = inputfile[:-3]+'.fastq'

    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
Exemplo n.º 22
0
def CutOutDomain(coords,filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut a region out of each FASTA record listed in a coordinate table.

    ``coords`` is a comma-separated file.  For every row, column
    ``column_id`` names a sequence and columns ``column_start`` and
    ``column_stop`` give the 1-based inclusive region to keep; when an ID
    occurs several times only its first row is used.  Matching records of
    ``filename`` are sliced, renamed to ``<id>_<start>_<stop>`` and written
    in FASTA format to ``CutOutdomain_<filename>``.

    coords       -- path of the comma-separated coordinate file
    filename     -- path of the FASTA file to cut from
    header       -- True to skip the first line of ``coords``
    column_id    -- index of the column holding the sequence ID
    column_start -- index of the column holding the start coordinate
    column_stop  -- index of the column holding the stop coordinate
    """
    from Bio import SeqIO
    Towrite=[]
    CoordIDDic={}
    with open(coords) as fh:
        # Kept as an equality test so that only True (or 1) skips the header.
        if header==True:
            print('header set to True, first line of %s will be ignored'%coords)
            fh.readline()
        else:
            print('header not set to True, first line of %s will be processed'%coords)

        for unformatedLine in fh:
            stripped=unformatedLine.replace('\xa0', '').strip()
            if not stripped:
                # Blank lines carry no coordinates; skip them instead of
                # failing on the missing columns.
                continue
            l=stripped.split(',')
            if l[column_id] not in CoordIDDic:
                CoordIDDic[l[column_id]]=l[column_start], l[column_stop]

    with open(filename) as seqfile:
        for s in SeqIO.parse(seqfile, 'fasta'):
            if s.id in CoordIDDic:
                start=(int(CoordIDDic.get(s.id)[0])-1)
                stop=int(CoordIDDic.get(s.id)[1])
                s.id=s.id+'_%s_%s'%((start+1), stop)
                Towrite.append(s[start:stop])

    # Closing the output guarantees the records are flushed to disk.
    with open('CutOutdomain_%s'%filename, 'w') as Output:
        SeqIO.write(Towrite, Output, 'fasta')
Exemplo n.º 23
0
def frameshift_writer(contigs, file):
    """Write the contigs flagged 'majority_frameshift' to ``file`` as FASTA.

    ``file`` is closed once the records have been written.
    """
    sys.stderr.write("[predict] writing frameshifts...")
    seqs = []
    for contig in contigs.values():
        # Only contigs whose annotation marks a majority frameshift are kept.
        if contig.annotation['majority_frameshift']:
            seqs.append(SeqRecord(seq=contig.seq, id=contig.id,
                                  description=contig.description))
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Exemplo n.º 24
0
def no_relatives_writer(contigs, file):
    """Write the contigs that have zero relatives to ``file`` as FASTA.

    ``file`` is closed once the records have been written.
    """
    sys.stderr.write("[predict] writing contigs with no relatives...")
    seqs = []
    for contig in contigs.values():
        # Only contigs annotated with num_relatives == 0 are kept.
        if contig.annotation['num_relatives'] == 0:
            seqs.append(SeqRecord(seq=contig.seq, id=contig.id,
                                  description=contig.description))
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Exemplo n.º 25
0
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        This method supports the python format() function added in
        Python 2.6/3.0.  The format_spec should be a lower case string
        supported by Bio.SeqIO as an output file format. See also the
        SeqRecord's format() method.

        An empty format_spec returns ``str(self)``.  Formats listed in
        ``SeqIO._BinaryFormats`` are written to a bytes buffer; all other
        formats are written to a text buffer.
        """
        if not format_spec:
            #Follow python convention and default to using __str__
            return str(self)
        from Bio import SeqIO
        if format_spec in SeqIO._BinaryFormats:
            #Return bytes on Python 3
            try:
                #This is in Python 2.6+, but we need it on Python 3
                from io import BytesIO
                handle = BytesIO()
            except ImportError:
                #Must be on Python 2.5 or older
                from StringIO import StringIO
                handle = StringIO()
        else:
            try:
                #Python 2: this StringIO accepts the str data SeqIO writes
                from StringIO import StringIO
            except ImportError:
                #Python 3 has no StringIO module; use the text buffer in io
                from io import StringIO
            handle = StringIO()
        SeqIO.write(self, handle, format_spec)
        return handle.getvalue()
Exemplo n.º 26
0
def main():
    """Convert an alignment file from the command line.

    Expects three arguments: the input file, its format and the desired
    output format.  The output is written next to the input, with the
    input's extension replaced by the output format name.

    * "nexus" output is converted with an ambiguous DNA alphabet.
    * "mega" output is written by hand from the parsed records.
    * any other format is passed straight to ``AlignIO.convert``.

    Exits with status 1 when the number of arguments is wrong.
    """
    import os

    if len(sys.argv) != 4:
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print("Please provide file, the file format, and the desired file format ")
        sys.exit(1)

    f = sys.argv[1]
    # splitext removes only the final extension; joining the pieces with ""
    # used to delete every other dot in the name as well.
    fout = os.path.splitext(f)[0]
    formatin = sys.argv[2]
    formatout = sys.argv[3]
    out_path = fout + '.' + formatout

    # One branch per run: the old if/if/else converted "nexus" a second time,
    # without the alphabet, through the trailing else.
    if formatout == 'nexus':
        AlignIO.convert(f, formatin, out_path, formatout, alphabet=IUPAC.ambiguous_dna)
    elif formatout == 'mega':
        # NOTE(review): the input is always parsed as "phylip-relaxed" here,
        # regardless of formatin - confirm that this is intended.
        # NOTE(review): the "U" open mode was removed in Python 3.11.
        with open(f, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))

        with open(out_path, 'w') as outfile:
            outfile.write('#mega'+"\n")
            outfile.write('!Title Mytitle;'+"\n")
            outfile.write('!Format DataType=DNA indel=-;'+"\n\n")

            for n in record_dict:
                outfile.write('#'+n+"\n")
                # Sequence lines are wrapped at 60 characters.
                for s in wrap(str(record_dict[n].seq), 60):
                    outfile.write(s+"\n")
    else:
        AlignIO.convert(f, formatin, out_path, formatout)
Exemplo n.º 27
0
    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        # "with" closes the handle even if parsing raises.
        with open(seq_file) as seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                    ('GenePair_STS', 'PCR_product'),
                    ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        # Restrict parsing to the (source, type) pairs listed above.
        limit_info = dict(gff_source_type = rnai_types + gene_types)
        # The loop only has to run to completion; the records are not used.
        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
def splitFastaFile(infile, informat, outdir):
    """Write every record of ``infile`` to its own FASTA file in ``outdir``.

    Each output file is named ``<record id>.fasta``.  ``outdir`` is created
    if it does not exist.

    infile   -- path of the input sequence file
    informat -- Bio.SeqIO format name of ``infile``
    outdir   -- directory receiving the per-record FASTA files
    """
    with open(infile) as in_handle:
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        for record in SeqIO.parse(in_handle, informat):
            f_out = os.path.join(outdir, record.id + '.fasta')
            # Close each output at once; one leaked handle per record can
            # exhaust the file-descriptor limit on large inputs.
            with open(f_out, 'w') as out_handle:
                SeqIO.write([record], out_handle, "fasta")
Exemplo n.º 29
0
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that parse/write and convert fail with the same ValueError.

    The input is first parsed and written with ``SeqIO.parse`` and
    ``SeqIO.write``, then converted with ``SeqIO.convert``.  Both routes
    must raise ValueError, and the two messages must be identical.
    UserWarning is silenced while writing when quality truncation is
    expected for ``out_format``.

    Raises AssertionError if either route succeeds or the messages differ.
    """
    qual_truncate = truncation_expected(out_format)
    #We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename,in_format, alphabet))
        handle = StringIO()
        # catch_warnings restores the filters even when the expected
        # ValueError is raised.  The old simplefilter()/filters.pop() pair
        # skipped the pop on failure and, since simplefilter inserts at the
        # front while pop() removes the last entry, removed the wrong filter.
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    #Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
               "Different failures, parse/write:\n%s\nconvert:\n%s" \
               % (err1, err2)
Exemplo n.º 30
0
    def test_acba_annot(self):
        """Run integron_finder with functional annotation and compare its outputs."""
        replicon_filename = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        template = ("integron_finder --outdir {out_dir} --func-annot --path-func-annot {annot_bank} --promoter-attI "
                    "--gbk --keep-tmp "
                    "{replicon}")
        command = template.format(out_dir=self.out_dir,
                                  annot_bank=self.resfams_dir,
                                  replicon=self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename)))
                                  )

        # The first word (the program name) is dropped before calling main.
        with self.catch_io(out=True, err=False):
            main(command.split()[1:], loglevel='WARNING')

        result_dir = os.path.join(self.out_dir, 'Results_Integron_Finder_{}'.format(replicon_filename))
        # Directory of the reference results, relative to the test data.
        expected_dir = 'Results_Integron_Finder_{}.annot'.format(replicon_filename)

        # GenBank output.
        gbk = '{}.gbk'.format(replicon_id)
        expected_gbk_path = self.find_data(os.path.join(expected_dir, gbk))
        expected_gbk = SeqIO.read(expected_gbk_path, 'gb')
        gbk_test = SeqIO.read(os.path.join(result_dir, gbk), 'gb')
        self.assertSeqRecordEqual(expected_gbk, gbk_test)

        # Integron table.
        output_filename = '{}.integrons'.format(replicon_filename)
        expected_result_path = self.find_data(os.path.join(expected_dir, output_filename))
        test_result_path = os.path.join(result_dir, output_filename)
        self.assertIntegronResultEqual(expected_result_path, test_result_path)

        # Resfams table kept in the temporary directory.
        output_filename = os.path.join('tmp_{}'.format(replicon_id), replicon_id + '_Resfams_fa_table.res')
        expected_result_path = self.find_data(os.path.join(expected_dir, output_filename))
        test_result_path = os.path.join(result_dir, output_filename)
        self.assertHmmEqual(expected_result_path, test_result_path)
Exemplo n.º 31
0
def Screen_seqs_against_fasta(record_fasta, ref_fasta, word_size=17, allowed_hits=0,
                              exclude_names=_adaptor_site_names, check_rc=True,
                              save=False, save_folder=None, save_name=None,
                              overwrite=False, return_kept_flag=False, verbose=True):
    """Function to screen sequences against a given fasta file
    Inputs:
        record_fasta: fasta filename or list of SeqRecord, str or list
        ref_fasta: filename for reference fasta file to screen against, string of file path
        word_size: word_size used for probe screening, int (default: 17)
        allowed_hits: allowed hits for one probe in the fasta, int (default: 0)
        exclude_names: reference sequences whose name (text before the first space)
            is in this list are left out of the reference table, list (default: _adaptor_site_names)
        check_rc: whether check reverse-complement of the probe, bool (default: True)
        save: whether save result probe reports, bool (default: False)
        save_folder: folder to save selected probes, string of path
            (default: None, which means the folder of record_fasta if it is a filename,
            otherwise the folder of ref_fasta)
        save_name: file name for the saved records, string
            (default: None, which means the basename of record_fasta)
        overwrite: whether overwrite existing result probe reports, bool (default: False)
            NOTE(review): currently not consulted; an existing file is always overwritten.
        return_kept_flag: whether return flags for whether keeping the record, bool (default:False)
        verbose: say something!, bool (default: True)
    Outputs:
        _kept_records: list of records whose hit count is <= allowed_hits
        (optional) boolean np.ndarray with one keep-flag per input record
    Raises:
        TypeError: record_fasta is neither a filename nor a list
        IOError: ref_fasta is not an existing file
    """
    ## Check inputs
    if verbose:
        print(f"- Screen sequences against given fasta file:{ref_fasta}")
    # load record-fasta
    if isinstance(record_fasta, str):
        with open(record_fasta, 'r') as _handle:
            _records = list(SeqIO.parse(_handle, "fasta"))
    elif isinstance(record_fasta, list):
        _records = record_fasta
    else:
        raise TypeError(
            f"record_fasta should be a filename or a list of records, got {type(record_fasta)}")
    if verbose:
        print(f"-- {len(_records)} sequences loaded.")

    if not os.path.isfile(ref_fasta):
        raise IOError(f"Reference fasta:{ref_fasta} is not a file.")
    word_size = int(word_size)
    allowed_hits = int(allowed_hits)
    if save_folder is None:
        if isinstance(record_fasta, str):
            save_folder = os.path.dirname(record_fasta)
        else:
            save_folder = os.path.dirname(ref_fasta)
    if not save_folder:
        # a bare filename has an empty dirname; os.makedirs('') would fail
        save_folder = os.curdir
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
        if verbose:
            print(f"-- create {save_folder} to store filter probes")

    ## construct table for ref_fasta
    if verbose:
        print(f"-- constructing reference table for fasta file")
    _ref_names, _ref_seqs = ld.fastaread(ref_fasta, force_upper=True)
    # filter sequences by given reference name
    _kept_ref_seqs = [_s for _n, _s in zip(_ref_names, _ref_seqs)
                      if _n.split(' ')[0] not in exclude_names]

    _ref_table = ld.OTmap(_kept_ref_seqs, word_size, use_kmer=True)

    ## filter records
    if check_rc:
        _hits = [_ref_table.get(str(_r.seq), rc=True) +
                 _ref_table.get(str(_r.seq), rc=False)
                 for _r in _records]
    else:
        _hits = [_ref_table.get(str(_r.seq), rc=False) for _r in _records]
    # filter
    _kept_flags = [_h <= allowed_hits for _h in _hits]
    _kept_records = [_r for _r, _k in zip(_records, _kept_flags) if _k]
    if verbose:
        print(
            f"-- {len(_kept_records)} sequences kept by allowing hits:{allowed_hits}")
    ## Save
    if save:
        if save_name is None:
            if isinstance(record_fasta, str):
                save_name = os.path.basename(record_fasta)
            else:
                print(f"Save name not given in either save_name kwd and record_fasta, skip.")
        # save_name stays None only when no name could be derived: nothing is written then
        if save_name is not None:
            if '.fasta' not in save_name:
                save_name += '.fasta'
            save_filename = os.path.join(save_folder, save_name)
            # SeqIO.write emits text, so the handle must be opened in text mode
            with open(save_filename, 'w') as _output_handle:
                if verbose:
                    print(
                        f"-- saving {len(_kept_records)} kept records in file:{save_filename}")
                SeqIO.write(_kept_records, _output_handle, "fasta")

    if return_kept_flag:
        # builtin bool: the np.bool alias no longer exists in current NumPy
        return _kept_records, np.array(_kept_flags, dtype=bool)
    else:
        return _kept_records
Exemplo n.º 32
0
import sys
import random
from Bio import SeqIO
import sys
import os

file_in = "all_4k_prediction.out"

# Names (first column) of the contigs whose score (third column) reaches the cutoff.
contig_list = set()
with open(file_in, 'r') as result:
    for line in result:
        fields = line.split()
        if not fields:
            # tolerate blank lines instead of failing on fields[2]
            continue
        if float(fields[2]) >= 0.99995837748:
            contig_list.add(fields[0])

# Keep only the FASTA records of the selected contigs.
# Plain "r" replaces "rU", which Python 3.11 no longer accepts.
with open("all_4k_contigs.fa", "r") as contigs_handle:
    long_list = [record for record in SeqIO.parse(contigs_handle, "fasta")
                 if record.name in contig_list]

with open("all_4k_contigs_top1k.fa", "w") as output_handle:
    SeqIO.write(long_list, output_handle, "fasta")
#Takes fasta and fasta from centroid and merges to bank file
from Bio import SeqIO
import sys
rna_path = sys.argv[1]
central_path = sys.argv[2]
# When True, entries whose structure segment consists only of "." are skipped.
no_black = True
records = list(SeqIO.parse(rna_path, "fasta"))
seconds = list(SeqIO.parse(central_path, "fasta"))
used_seqs = set()
for index, record in enumerate(records):
    seq_len = len(record.seq)
    is_black = True
    if no_black:
        # The segment printed below is the slice [seq_len:2 * seq_len] of the
        # companion record; it is "black" when every character is ".".
        is_black = all(el == "." for el in seconds[index].seq[seq_len:seq_len + seq_len])
    if (record.seq not in used_seqs) and not (no_black and is_black):
        used_seqs.add(record.seq)

        print("# File", record.id)
        print("# External source : RNACentral")
        print("# Type : ?")
        print("# Length :", seq_len)
        print("# Description :", ' '.join(record.description.split(" ")[1:]))
        print()
        print(record.seq.transcribe())
        print(seconds[index].seq[seq_len:seq_len + seq_len])
        print()
def getRevComp(file):
    """Print how many FASTA records in *file* equal their own reverse complement."""
    palindromic = [record for record in SeqIO.parse(file, 'fasta')
                   if record.seq.reverse_complement() == record.seq]
    print(len(palindromic))
Exemplo n.º 35
0
import sys

_p = "/home/ksimmon/reference/ard/"
sys.stderr.write("Retrieving antibiotic resistance genes\n")
_l = set()

# categories.txt: tab-separated; the first column minus its last dot-suffix is the key.
descriptions = {}
with open(_p + "categories.txt") as categories_handle:
    for line in categories_handle:
        v = line.strip().split("\t")
        name = ".".join(v[0].split(".")[:-1])
        descriptions[name] = v

# AROtags.txt: tab-separated; third column maps to second column.
aro_tags = {}
with open(_p + "AROtags.txt") as tags_handle:
    for line in tags_handle:
        v = line.strip().split("\t")
        aro_tags[v[2]] = v[1]

print(aro_tags)
for s in SeqIO.parse(_p + "ARmeta-genes.fa", "fasta"):

    gene_id = s.description.split(" ")[0]
    # text between the last "[" and the last "]" of the description
    species = s.description[s.description.rfind("[") +
                            1:s.description.rfind("]")]
    aro_tag = [
        part.split(" ")[0] for part in s.description.split(". ")
        if "ARO:" in part and "ARO:1000001" not in part
    ]
    # Single pre-formatted string: same space-separated output under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s %s %s" % (gene_id, species, descriptions[gene_id][1],
                           ",".join([aro_tags[tag] for tag in aro_tag])))
Exemplo n.º 36
0
# Converting hits to nucleotide coordinates
# Only genes that are not listed in blast_missed get an entry.
nucleotide_blast = {x: [] for x in gene_ids if x not in blast_missed}
for gene_id in nucleotide_blast:
    nucleotide_blast[gene_id] =\
            convert_coord_dict(features[gene_id],
                               coordinate_sets[gene_id])

# Extract the gene sequences, if args.f is set, and export data
if args.f:
    print('Loading sequences...', file=stderr)
    # Run without Biopython, if FASTA is not supplied
    from Bio import SeqIO
    # ID prefixes of every feature encountered while scanning the FASTA file
    processed = set()
    with open(args.f + '.genes', mode='w+') as gene_fasta:
        # NOTE(review): tsv and the handle passed to SeqIO.parse are opened without
        # a context manager; no close() is visible in this part of the file.
        tsv = open(args.e, mode='w')
        for record in SeqIO.parse(open(args.f), 'fasta'):
            if record.id in features_by_source:
                for feature in features_by_source[record.id]:
                    processed.add(feature.get_id_prefix())
                    # Feature region extended by flank_size on both sides.
                    # NOTE(review): if feature.start < args.flank_size the lower bound
                    # becomes negative and the slice counts from the end - confirm.
                    segment = record[feature.start -
                                     args.flank_size:feature.end +
                                     args.flank_size]
                    if feature.strand == '-':
                        segment = segment.reverse_complement()
                    segment.id = feature.get_id_prefix()
                    segment.description = ''
                    # Work on a copy so the shared nucleotide_blast entry is not
                    # modified; features without an entry are skipped.
                    try:
                        blast_regions = deepcopy(
                            nucleotide_blast[feature.get_id_prefix()])
                    except KeyError:
                        continue
Exemplo n.º 37
0
def main(args):
    """Export the BioSQL entries belonging to a taxon.

    ``args.taxid`` is treated as an NCBI taxon ID when it parses as an
    integer, otherwise as a taxon name matched with SQL ``LIKE``. Every
    bioentry attached to a leaf taxon (``right_value == left_value + 1``)
    under the selected taxon is exported.

    Output depends on ``args``:
      * ``split_species``: one file per taxon, named after its scientific
        name (spaces replaced by ``_``) with extension ``.gb`` / ``.faa`` /
        ``.fna`` chosen from ``output_format``.
      * otherwise ``feat-prot`` / ``feat-nucl`` delegate to
        ``extract_feature_sql``; any other format is written to stdout
        with ``SeqIO.write``.

    Args:
        args: parsed command line options; the attributes read here are
            driver, database, user, host, password, taxid, feature_type,
            output_format and split_species.
    """
    server = BioSeqDatabase.open_database(driver=args.driver,
                                          db=args.database,
                                          user=args.user,
                                          host=args.host,
                                          passwd=args.password)

    # The two lookups differ only in how the root taxon is selected.
    lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
                 "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
                 "(SELECT DISTINCT include.taxon_id FROM taxon "\
                 "INNER JOIN taxon as include ON (include.left_value "\
                 "BETWEEN taxon.left_value AND taxon.right_value) "\
                 "WHERE {root_filter} AND include.right_value = include.left_value + 1)"

    try:
        ncbi_tax = int(args.taxid)
    except ValueError:
        print("interpreting as a taxon name...", file=sys.stderr)
        root_filter = "taxon.taxon_id IN (SELECT taxon_id FROM taxon_name "\
                      "WHERE name like %s)"
        query_value = args.taxid
    else:
        print("interpreting as an NCBI taxon ID...", file=sys.stderr)
        root_filter = "taxon.ncbi_taxon_id  = %s"
        query_value = ncbi_tax
    rows = server.adaptor.execute_and_fetchall(
        lookup_sql.format(root_filter=root_filter), (query_value, ))

    # Feature types are only consulted by the feat-* output formats.
    if args.feature_type is not None:
        types = args.feature_type
    elif args.output_format == 'feat-prot':
        types = ['CDS']
    elif args.output_format == 'feat-nucl':
        types = ['CDS', 'rRNA', 'tRNA']
    else:
        types = None

    # (bioentry_id, database name) -> taxon_id
    dbids = {(row[0], row[2]): row[1] for row in rows}

    if args.split_species:
        extension = {'gb': '.gb', 'feat-prot': '.faa'}.get(args.output_format, '.fna')
        files = {}
        # got to save all of the records before printing them out
        outdata = {}
        for (dbid, dbname), taxid in dbids.items():
            if taxid not in files:
                # one name lookup per taxon rather than one per bioentry
                tname = server.adaptor.execute_and_fetch_col0(
                    "SELECT name from taxon_name where taxon_id = %s and name_class = %s",
                    (taxid, 'scientific name'))[0]
                files[taxid] = tname.replace(' ', '_') + extension
            outdata.setdefault(taxid, []).append(server[dbname][dbid])

        for taxid, dbrecs in outdata.items():
            with open(files[taxid], 'w') as fp:
                if 'feat' in args.output_format:
                    for dbrec in dbrecs:
                        extract_feature(dbrec, args.output_format, fp)
                else:
                    SeqIO.write(dbrecs, fp, args.output_format)
        return

    if args.output_format == 'feat-prot':
        extract_feature_sql(server,
                            get_seqfeature_ids_for_bioseqs(
                                server, [x[0] for x in dbids.keys()]),
                            type=types,
                            translate=True)
    elif args.output_format == 'feat-nucl':
        extract_feature_sql(server,
                            get_seqfeature_ids_for_bioseqs(
                                server, [x[0] for x in dbids.keys()]),
                            type=types)
    else:
        for (dbid, dbname), taxid in dbids.items():
            db = server[dbname]
            try:
                dbrec = db[dbid]
                SeqIO.write(dbrec, sys.stdout, args.output_format)
            except KeyError as err:
                # best effort: keep exporting the remaining entries, but say
                # which one was dropped instead of skipping it silently
                print("skipping bioentry {} from {}: {!r}".format(dbid, dbname, err),
                      file=sys.stderr)
Exemplo n.º 38
0
def Check_adaptors_against_fasta(readout_fasta, adaptor_site_fasta, ref_fasta, word_size=11, allowed_hits=0,
                                 exclude_names=_adaptor_site_names, check_rc=True,
                                 save=False, save_folder=_readout_folder, save_name=None, save_postfix='_kept',
                                 save_adaptors=False, overwrite=False, verbose=True):
    """Function to check adaptors against a list of fasta files, until get satisfying matches
    Adaptors are generated from the current readouts, screened against every
    reference fasta, and readouts whose adaptor fails any screen are dropped;
    this repeats until a round removes no readout.
    Inputs:
        readout_fasta: filename of readout sequences, string of file path
        adaptor_site_fasta: filename of adaptor sites, passed to Generate_adaptors, string of file path
        ref_fasta: reference fasta filename(s) to screen against, string or list of strings
        word_size: word_size forwarded to Screen_seqs_against_fasta, int (default: 11)
        allowed_hits: allowed hits forwarded to Screen_seqs_against_fasta, int (default: 0)
        exclude_names: reference names forwarded to Screen_seqs_against_fasta, list (default: _adaptor_site_names)
        check_rc: whether check reverse-complement, forwarded to Screen_seqs_against_fasta, bool (default: True)
        save: whether save kept readouts, bool (default: False)
        save_folder: folder for saved files, string of path (default: _readout_folder)
        save_name: file name for kept readouts, string
            (default: None, which means basename of readout_fasta with save_postfix before '.fasta')
        save_postfix: postfix used to build the default save_name, string (default: '_kept')
        save_adaptors: whether save adaptors of the kept readouts, bool (default: False)
        overwrite: NOTE(review): currently not consulted
        verbose: say something!, bool (default: True)
    Outputs:
        _kept_readouts: list of SeqRecords of which readouts are saved
    Raises:
        TypeError: wrong type for readout_fasta, adaptor_site_fasta or ref_fasta
        IOError: any of the input files does not exist
    """
    ## check inputs
    if verbose:
        print(f"- Check raedouts->adaptors against fasta")
    # readout_Fasta
    if not isinstance(readout_fasta, str):
        raise TypeError(f"Wrong input type of readout_fasta:{readout_fasta}")
    elif not os.path.isfile(readout_fasta):
        raise IOError(
            f"Input file readout_fasta:{readout_fasta} not exist, exit!")
    # adaptor_site_Fasta
    if not isinstance(adaptor_site_fasta, str):
        raise TypeError(
            f"Wrong input type of adaptor_site_fasta:{adaptor_site_fasta}")
    elif not os.path.isfile(adaptor_site_fasta):
        raise IOError(
            f"Input file adaptor_site_fasta:{adaptor_site_fasta} not exist, exit!")
    # ref_fasta
    if isinstance(ref_fasta, str):
        ref_fasta = [ref_fasta]
    if not isinstance(ref_fasta, list):
        raise TypeError(
            f"ref_fasta should be either one filename or list of filenames")
    for _fl in ref_fasta:
        if not os.path.isfile(_fl):
            raise IOError(f"input ref_fasta file:{_fl} not exist, exit.")
    # save etc.
    if save_name is None:
        save_name = os.path.basename(readout_fasta).replace(
            '.fasta', save_postfix+'.fasta')
    # built for both a default and a user-given save_name, so saving never
    # hits an undefined save_filename
    save_filename = os.path.join(save_folder, save_name)
    # load readouts
    with open(readout_fasta, 'r') as _handle:
        readouts = list(SeqIO.parse(_handle, "fasta"))
    if verbose:
        print(f"-- {len(readouts)} readout loaded")
    # initialize adaptor selection flags
    _adaptor_flags = []
    # stop once a round keeps every readout (flags and readouts have equal length)
    while len(_adaptor_flags) != len(readouts):
        # generate current adaptors
        _adaptors = Generate_adaptors(readouts, adaptor_site_fasta)
        # update whether keep the adaptor
        _adaptor_flags = np.ones(len(_adaptors), dtype=bool)
        for _fl in ref_fasta:
            _, _fl_kept = Screen_seqs_against_fasta(_adaptors, _fl, word_size=word_size,
                                                    allowed_hits=allowed_hits,
                                                    exclude_names=exclude_names,
                                                    check_rc=check_rc,
                                                    return_kept_flag=True, verbose=False)
            # an adaptor must pass every reference to be kept
            _adaptor_flags &= _fl_kept

        readouts = [_r for _r, _f in zip(readouts, _adaptor_flags) if _f]
    if verbose:
        print(f"-- {len(readouts)} readous are kept.")
    if save:
        with open(save_filename, 'w') as _output_handle:
            if verbose:
                print(f"-- saving filtered readouts to file: {save_filename}")
            SeqIO.write(readouts, _output_handle, "fasta")

    if save_adaptors:
        _adaptors = Generate_adaptors(readouts, adaptor_site_fasta)
        adaptor_save_filename = save_filename.replace(
            '.fasta', '_adaptor.fasta')
        if verbose:
            print(
                f"-- saving corresponding adaptors to file: {adaptor_save_filename}")
        with open(adaptor_save_filename, 'w') as _output_handle:
            # write the adaptors themselves, not the readouts
            SeqIO.write(_adaptors, _output_handle, "fasta")

    return readouts
Exemplo n.º 39
0
                continue
            ctg1 = sline[0].split("_")[0].strip("+").strip("-")
            ctg2 = sline[0].split("_")[1].strip("+").strip("-")
            ori1 = sline[0].split("_")[0][0]
            ori2 = sline[0].split("_")[1][0]
            distance = float(sline[1])
            dist = int(distance)

# Optional blacklist: each line is whitespace-split and its first field is
# mapped to its second field. `blacklist` only exists when the option is set.
if args.blacklistfile:
    blacklist = {}
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            blacklist[sline[0]] = sline[1]

# Record the sequence length of every FASTA entry, keyed by its id.
for read in SeqIO.parse(args.contigfile, "fasta"):
    contigs[read.id] = len(read.seq)

print("Nr. of scaffolds: " + str(len(contigs)))


class Node:
    """Node holding contig segments; both methods are still unimplemented stubs."""

    # NOTE(review): both attributes are mutable objects defined on the class,
    # so they are shared by all instances unless rebound - confirm this is intended.
    segments = {}  # id1: (lc1, rc1)
    sizes_nc = [
    ]  # sizes of contigs that determine the average (that were Not Cut)

    def get_right_nb(self):
        # Stub: no behaviour yet.
        pass

    def get_distance_table(self):
        # Stub: no behaviour yet.
        pass
Exemplo n.º 40
0
def Search_Candidates(source_readout_file, total_cand=200, existing_readout_file='cand_readouts.fasta',
                      readout_folder=_readout_folder, GC_percent=[0.4,0.6], max_consecutive=4, 
                      max_rep=6, C_percent=[0.2, 0.28], blast_hsp_thres=10.0, 
                      save_name='selected_candidates.fasta', verbose=True):
    """Function to search readout sequences in a given pool compared with existing readouts
    Each source record is extended up to 32 times; the first extension that
    passes Filter_Readout is kept, appended to existing_readout_file (so later
    candidates are also filtered against it) and collected for the final save.
    Inputs:
        source_readout_file: filename for readout sequence pool, string (should be .fasta)
        total_cand: number of candidates we hope to generate, int (default: 200)
        existing_readout_file: filename for existing readouts, string (should be .fasta)
        readout_folder: folder to store readout information, string (default: globally given)
        GC_percent: whether check gc content, list of two values or False ([0.4, 0.6])
        max_consecutive: maximum allowed consecutive bases, int (4)
        max_rep: maximum replicated sequence in a readout allowed, int (6)
        C_percent: percentage of base C, list of two values or False([0.2, 0.28])
        blast_hsp_thres: threshold for blast hsp, no hsp larger than this allowed, float (10.0)
        save_name: filename (inside readout_folder) for the selected candidates, string
        verbose: say something!, bool (default: True)
    Outputs:
        _cand_records: list of Bio.SeqRecord.SeqRecord objects
    Raises:
        IOError: an input file cannot be found or does not look like a .fasta file
    """
    ## check input files
    if not os.path.isfile(source_readout_file):
        source_readout_file = os.path.join(readout_folder, source_readout_file)
        if not os.path.isfile(source_readout_file):
            raise IOError(f"Wrong input source readout file:{source_readout_file}, not exist.")
    elif '.fasta' not in source_readout_file:
        raise IOError(f"Wrong input file type for {source_readout_file}")
    if not os.path.isfile(existing_readout_file):
        existing_readout_file = os.path.join(readout_folder, existing_readout_file)
        if not os.path.isfile(existing_readout_file):
            raise IOError(f"Wrong input source readout file:{existing_readout_file}, not exist.")
    elif '.fasta' not in existing_readout_file:
        raise IOError(f"Wrong input file type for {existing_readout_file}")

    # start looping
    if verbose:
        print(f"- Start selecting readout candidates from {source_readout_file},\n\tfiltering with {existing_readout_file} ")
    _cand_records = []
    _ct = 0
    _max_trials = 32  # extension attempts per source record
    # plain "r": the "U" flag is rejected by Python 3.11+
    with open(source_readout_file, "r") as _handle:
        for _record in SeqIO.parse(_handle, "fasta"):
            if len(_cand_records) >= total_cand:
                if verbose:
                    print(f"-- {total_cand} new candidates acquired, stop iteration.")
                break

            # verbose only controls printing; the search itself always runs
            if verbose:
                print (f"--- processing: {_record.seq}")
            for _ in range(_max_trials):
                _new_seq = Extend_Readout(_record.seq)
                _keep = Filter_Readout(_new_seq,GC_percent=GC_percent, 
                                        max_consecutive=max_consecutive,
                                        max_rep=max_rep, C_percent=C_percent,
                                        blast_hsp_thres=blast_hsp_thres,
                                        readout_folder=readout_folder,
                                        blast_ref=os.path.basename(existing_readout_file),
                                        verbose=False)
                if _keep:
                    _kept_record = SeqRecord(_new_seq, id='cand_'+str(_ct+1), description='30mer_candidate')
                    _cand_records.append(_kept_record)
                    if verbose:
                        print (f"--- candidate:{_ct} {_new_seq} saved")
                    # Save to candidate records
                    with open(existing_readout_file, "a") as _output_handle:
                        SeqIO.write(_kept_record, _output_handle, "fasta")
                    _ct += 1
                    break
    # after selection, save selected_candidates
    _save_filename = os.path.join(readout_folder, save_name)
    with open(_save_filename, 'w') as _output_handle:
        if verbose:
            print(f"-- saving candidate readouts into file: {_save_filename}")
        SeqIO.write(_cand_records, _output_handle, "fasta")
    
    return _cand_records
Exemplo n.º 41
0
from Bio import SeqIO

records = list(SeqIO.parse("sequence.fasta", "fasta"))

seq = str(records[0].seq[0:])
seqLen = len(seq)

# Build a dictionary with key meaning the sequence and value meaning the amount of times that it appears
seqOcur = {}
for start in range(seqLen - 37 + 1):
    window = seq[start:start + 37]
    seqOcur[window] = seqOcur.get(window, 0) + 1

# Print the first 100 different sequences with their respective count
print(len(seqOcur), "different subsequences of size 37 were found. \n\n")
print("The first 100 sequences with their respective count:")
# enumerate drives the counter, so the loop really stops after 100 entries
for limit, (key, value) in enumerate(seqOcur.items(), start=1):
    if limit == 101:
        break
    print("Sequence:", key, "Count:", value)
Exemplo n.º 42
0
elif output_entry != 'X':
    output_figure = output_entry + '_output_figure'
    output_sequence = output_entry + '_output_clean.fasta'
    output_rejected = output_entry + '_output_rejected.fasta'
    output_tabular = output_entry + '_output_analysis.csv'
    output_full_table = output_entry + '_full_comparison_table.csv'

# Start timer

start_time = time.time()

# Initialize

alignment_record_name_list = []

for record in SeqIO.parse(input_sequence, "fasta"):
    alignment_record_name_list.append(record.name)

depth_of_alignment = (len(alignment_record_name_list))

record_sequence_trial_results = pd.DataFrame(alignment_record_name_list,
                                             columns=['Accession'])

if number_in_small_test == 0:
    number_in_small_test = depth_of_alignment

if number_in_small_test == depth_of_alignment:
    min_trials_for_each_sequence = 1

print("Analyzing '" + input_sequence + "'.")
print('Flags are --IQR_coefficient: ' +
Exemplo n.º 43
0
import gzip
import sys
from Bio import SeqIO

out_file = sys.argv[2]

# 'rt' gives SeqIO decoded text; gzip.open rejects the 'U' flag on Python 3.
# The context managers close both files even if parsing fails.
with gzip.open(sys.argv[1], 'rt') as fasta, open(out_file, 'w') as o:
    for i in SeqIO.parse(fasta, 'fasta'):
        name = i.id
        o.write(str(name) + '\n')

#usage example: python get_ids.py calJac3.fa.masked.gz calJac3.fa.ids.txt
Exemplo n.º 44
0
    np.save(outlabel, train_label)


if __name__ == '__main__':
    # usage: <script> <fasta file> <itr1> <output directory>
    infile = sys.argv[1]
    itr1 = int(sys.argv[2])
    outpath = sys.argv[3]

    # drop a single trailing slash so the joins below do not double it
    if outpath[-1] == "/":
        outpath = outpath[:-1]

    ###### read fasta file #######
    # Record ids are expected as "<gene name>,<label>".
    # geneset : [[gene name, genelabel(mi=0,sno=1,t=2), sequence],...
    dataset = []
    label_lines = []
    for record in SeqIO.parse(infile, "fasta"):
        id_parts = record.id.split(",")
        seq_part = str(record.seq.upper())

        # append/join keep this linear instead of rebuilding the list and
        # the output string for every record
        dataset.append([id_parts[0], int(id_parts[1]), seq_part])
        label_lines.append(id_parts[0] + ":" + id_parts[1] + "\n")

    out = "".join(label_lines)
    with open(outpath + "/genelabel.txt", "w") as f:
        f.write(out)

    ##############################

    make_pairFASTA(dataset, itr1, outpath)
Exemplo n.º 45
0
# Python code to count the sequences in a FASTA file that equal their own reverse complement

from Bio import SeqIO
from Bio.Seq import Seq

seq = []  # sequences collected from the FASTA file
cnt = 0  # number of sequences for which reverse_comp returned 1
response = 0  # most recent flag returned by reverse_comp


def reverse_comp(nucleotide):
    """Return 1 if *nucleotide* equals its own reverse complement, else 0.

    The argument must support ``[::-1]`` slicing and the slice must offer a
    ``complement()`` method.
    """
    is_own_revcomp = nucleotide[::-1].complement() == nucleotide
    return 1 if is_own_revcomp else 0


# Collect every sequence of the input file.
seq.extend(record.seq for record in SeqIO.parse('data/rosalind_rvco.txt', 'fasta'))

# Count the sequences flagged by reverse_comp.
for nucleotide in seq:
    response = reverse_comp(nucleotide)
    if response == 1:
        cnt += 1

print(cnt)
Exemplo n.º 46
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# required module imports
import httplib2, urllib, json, sys, StringIO
from Bio import SeqIO

# setup http object
http = httplib2.Http(".cache")

# optional single argument: how many times the request is repeated
iterations = 1
if len(sys.argv) == 2:
    iterations = int(sys.argv[1])

for x in range(0, iterations):
    print(str(x))
    # Get gene genomic seq
    resp, content = http.request("http://127.0.0.1:3000/sequence/id/ENSG00000139618.fasta", method="GET", headers={"Content-Type":"text/plain"})

    # check response ok
    if not resp.status == 200:
        # single pre-formatted string: identical output with the Python 2
        # print statement and the print function
        print("Invalid response:  %s" % resp.status)
        sys.exit()

    io = StringIO.StringIO(content)
    for record in SeqIO.parse(io, "fasta"):
        print("definition :" + record.id)
        print("nalen      :" + str(len(record.seq)))
Exemplo n.º 47
0
from Bio import SeqIO

input_filename = 'source_downloads/hgb_ref_HetGla_female_1.0_chrUn.fa'

# Re-emit every record with the fourth '|'-separated field of its
# description placed first on the header line.
with open(input_filename) as input_handle:
    for seq in SeqIO.parse(input_handle, 'fasta'):
        split_desc = seq.description.split('|')
        print('>{} {}\n{}'.format(split_desc[3], seq.description, str(seq.seq)))
Exemplo n.º 48
0
def retrieve_seq(acc_num):
    """Read the GenBank record stored at ``<acc_num>/<acc_num>.gb``.

    Returns the single record parsed by ``SeqIO.read``; the file is closed
    before returning.
    """
    gb_file = "{}/{}.gb".format(acc_num, acc_num)
    with open(gb_file, "r") as handle:
        entry = SeqIO.read(handle, "genbank")
    return entry
Exemplo n.º 49
0
def _best_gblock_window(primer_pairs, scan_limit, gblock_len, flank):
    """Return [start, end] of the first window holding the most primer pairs ([0, 0] if none fits)."""
    best_count = 0
    best_window = [0, 0]
    for start in range(scan_limit):
        end = start + gblock_len
        captured = sum(1 for fwd, rev in primer_pairs
                       if fwd - flank > start and rev + flank < end)
        # strict '>' keeps the earliest window among ties
        if captured > best_count:
            best_count = captured
            best_window = [start, end]
    return best_window


def _pairs_outside_window(primer_pairs, window):
    """Return the primer pairs that start before or end after *window*."""
    return [pair for pair in primer_pairs
            if pair[0] < window[0] or pair[1] > window[1]]


def produce_glocks_method(fn_fasta, fn_assays_list, scan_limit=1800,
                          gblock_len=490, flank=20, max_gblocks=3):
    '''Select gBlocks covering primer pairs and write them as FASTA.

    USED TO EFFICIENTLY SELECT GBLOCKS, GIVEN A FASTA FILE OF FULL LENGTH SEQUENCE AND A PRIMER FILE OF THE EXPECTED FORMAT LIKE THIS
    >ACF24861.1_1091_spec_F_1344	TGGCAGGCGGATAAATTCTT	>ACF24861.1_1091_spec_R_1437	CGGCACTGTCAAACCCATAA

    For every reference sequence named in the primer file, up to
    ``max_gblocks`` windows are chosen in turn; each round only considers
    the primer pairs lying outside the previous window.

    Args:
        fn_fasta: FASTA file with the full-length reference sequences.
        fn_assays_list: tab-separated primer file (columns 1 and 3 hold the
            forward and reverse primer names, each ending in its position).
        scan_limit: number of window start positions tried (0 .. scan_limit-1).
        gblock_len: window length.
        flank: margin required between a primer position and the window edge.
        max_gblocks: maximum number of gBlocks written per reference.

    Returns:
        None. The gBlocks are written to ``fn_assays_list + ".gblocks"`` and
        progress is printed to stdout.
    '''
    import re
    from Bio import SeqIO

    # plain "r": the "U" flag is rejected by Python 3.11+
    with open(fn_fasta, "r") as handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    ref_pattern = re.compile(r'>([a-zA-Z0-9]+\.[0-9]+)')
    fwd_pattern = re.compile(r'>[a-zA-Z0-9]+\.[0-9]+_.*F_([0-9]+)')  # >AAC60788.1_1_spec_F_1373
    rev_pattern = re.compile(r'>[a-zA-Z0-9]+\.[0-9]+_.*R_([0-9]+)')  # >AAC60788.1_1_spec_R_1457

    # Reference ids in order of first appearance, and their [forward, reverse] positions.
    ref_order = []
    pairs_by_ref = {}
    with open(fn_assays_list, 'r') as fh:
        for line in fh:
            fields = line.strip().split('\t')
            if fields == ['']:
                # blank line: nothing to parse
                continue
            ref_id = ref_pattern.match(fields[0]).group(1)
            if ref_id not in pairs_by_ref:
                pairs_by_ref[ref_id] = []
                ref_order.append(ref_id)
            fwd_pos = int(fwd_pattern.match(fields[0]).group(1))
            rev_pos = int(rev_pattern.match(fields[2]).group(1))
            pairs_by_ref[ref_id].append([fwd_pos, rev_pos])

    # the context manager guarantees the output is flushed and closed
    with open(fn_assays_list + ".gblocks", 'w') as oh:
        for r in ref_order:
            print("#######%s#######" % (r))
            remaining = list(pairs_by_ref[r])
            print(remaining)
            print("")
            seq = str(record_dict[r].seq)
            for block_number in range(1, max_gblocks + 1):
                window = _best_gblock_window(remaining, scan_limit, gblock_len, flank)
                # NOTE(review): the window search requires a `flank` margin, but
                # this membership test does not, so a pair inside the window yet
                # within the margin is reported as captured - confirm intent.
                missed = _pairs_outside_window(remaining, window)
                captured = len(remaining) - len(missed)

                print(window)
                oh.write('>' + record_dict[r].description + "_gBlock" + str(block_number) + "_"
                         + str(window[0]) + "_" + str(window[1])
                         + "_captures_" + str(captured) + "\n")
                oh.write(seq[window[0]:window[1]] + "\n")  # TAKE THE INTERVAL
                print(str(captured) + " SEQUENCES WERE CAPTURED")

                if not missed:
                    break
                # the next gBlock only has to cover what this one missed
                remaining = missed
Exemplo n.º 50
0
import sys
from Bio import SeqIO
import glob
import os.path
from collections import defaultdict

# Map each reference record id to its full FASTA description line.
lookup = {}

# Use a context manager so the reference file is closed once it has been read.
with open(sys.argv[1]) as ref_handle:
    for rec in SeqIO.parse(ref_handle, "fasta"):
        lookup[rec.id] = rec.description

# sample directory -> list of (record count, reference description) tuples,
# one tuple per cluster file found under that sample.
results = defaultdict(list)

for sample in sys.argv[2:]:
    for fn in glob.glob(sample + '/clusters/*.cluster.fasta'):
        refid = os.path.basename(fn)

        # Count the records lazily instead of building a list of them, and
        # close each cluster file right away: a sample can contain many
        # cluster files and leaving every handle open leaks descriptors.
        with open(fn) as cluster_handle:
            hits = sum(1 for _ in SeqIO.parse(cluster_handle, 'fasta'))

        # The cluster file name minus its suffix is used as the key into lookup.
        tofind = refid.replace('.cluster.fasta', '')

        results[sample].append((hits, lookup[tofind]))

for sample, hits in results.iteritems():
    total_hits = sum([h[0] for h in hits])

    for h in hits:
        taxonomy = h[1].split(" ")

        print "%s\t%s\t%s %s\t%s\t%s\t%s" % (sample, taxonomy[1], taxonomy[1],
                                             taxonomy[2], h[1], h[0],
Exemplo n.º 51
0
    quiverStr = Tools.ExtractSeq((quiverName, quiverStart, quiverEnd), tFile, tFai)
    
    print "length of : " + targetName + " " + str(len(genomeStr))
    quiverName = targetName + " quiver"
    print "length of : " + quiverName + " " + str(len(genomeStr))
#    quiverStr = targetDict[targetName][tRegionStart:tRegionEnd]
    
    genomeFileName = tempfile.mktemp(suffix=".fasta", dir=".")
    quiverFileName = tempfile.mktemp(suffix=".fasta", dir=".")
    tmpFileNames.append(genomeFileName)
    tmpFileNames.append(quiverFileName)
    genomeFile = open(genomeFileName, 'w')
    quiverFile = open(quiverFileName, 'w')
    genomeName = "{}:{}-{}".format(chrom,start,end)
    quiverName = "{}:{}-{} quiver".format(chrom,start,end)
    SeqIO.write(SeqRecord.SeqRecord(seq=Seq.Seq(genomeStr), id=genomeName,name="",description=""), genomeFile, "fasta")
    SeqIO.write(SeqRecord.SeqRecord(seq=Seq.Seq(quiverStr), id=quiverName,name="",description=""), quiverFile, "fasta")
    genomeFile.close()
    quiverFile.close()
    command="/net/eichler/vol5/home/mchaisso/projects/PacBioSequencing/scripts/DotPlot.py --query {} --target {} --savefig {}_{}_{}.pdf --matches dot:11".format(quiverFileName, genomeFileName, chrom,start,end)
    print "running " + command
    subprocess.call(command.split())
    for tmpfile in tmpFileNames:
        command = "/bin/rm -f " + tmpfile
        subprocess.call(command.split())
                
    
    
#chr10   102356490       102357412       chr10:100000000-110000000|quiver_2300000_2350000/0_50000        interior

Exemplo n.º 52
0
def main():
    """Command-line entry point: parse options and run the pipeline steps.

    Builds the option parser, validates the required options and input
    files, makes sure the ``out`` and ``tmp`` directories exist, then calls
    the BLAST, parsing, region-writing, clustering and final-output helpers
    in order. Finally removes every entry in ``tmp`` whose path starts with
    ``tmp/<out name>``.
    """
    usage = "\n%prog  [options]\nNeeded software: Biopython, BLAST (2.3.0 for reproducibility), bedtools (2.28 for reproducibility)"
    parser = OptionParser(usage, version="%prog " + __version__)

    # One row per option, registered in this order:
    # (short flag, long flag, dest, extra keyword arguments, help text).
    option_table = [
        ("-f", "--fasta", "fasta", {},
         "Transcript file in FASTA format (required)."),
        ("-g", "--gtf", "gtf", {},
         "Transcript coordinates in GTF format (required)."),
        ("-d", "--db", "db", {},
         "FASTA database for species comparison (required)."),
        ("-o", "--out", "out", {},
         "Output unique name (required)."),
        ("-m", "--mask", "mask", {"default": "none"},
         "GTF file for masking (f.e. pseudogenes)."),
        ("-O", "--orthologs", "ort", {"default": "none"},
         "Additional list of known gene orthologs (f.e. Ensembl Compara)."),
    ]
    for short_flag, long_flag, dest_name, extra_kwargs, help_text in option_table:
        parser.add_option(short_flag,
                          long_flag,
                          action="store",
                          dest=dest_name,
                          help=help_text,
                          **extra_kwargs)

    # External executables are looked up by bare name.
    blast_path = 'blastn'
    bedtools_path = 'bedtools'

    opt, _positional = parser.parse_args()

    # Validate the four required options first, then the two input files.
    required_options = (
        (opt.fasta, "--fasta"),
        (opt.gtf, "--gtf"),
        (opt.db, "--db"),
        (opt.out, "--out"),
    )
    for option_value, option_flag in required_options:
        check_arg(option_value, option_flag)
    for input_path in (opt.fasta, opt.gtf):
        check_file(input_path)

    # Working directories used by the pipeline steps below.
    for directory in ('out', 'tmp'):
        if not os.path.exists(directory):
            os.makedirs(directory)

    seqs = SeqIO.index(opt.fasta, "fasta")  #BLAST 2.3.0
    exc = exclude(seqs)

    run_blast(opt.fasta, opt.db, blast_path, '1', opt.out)

    print('Parsing')
    blast_xml = "tmp/blast_" + opt.out + ".xml"
    regions = parse_blast(blast_xml, exc)
    coords_t = parse_gtf(opt.gtf, opt.out)

    print('Writing main regions')
    write_main_regions(regions, opt.gtf, coords_t, exc, opt.out, opt.mask,
                       opt.ort, bedtools_path)  #BEDtools 2.28

    print('Clustering regions from same genes')
    clusters = cluster_regions2(opt.out)

    print('Writing final output')
    biotype = getBiotypes(opt.gtf)
    write_final_regions(opt.out, clusters, biotype)

    # Clean up only the temporary files that carry this run's output name.
    run_prefix = 'tmp/' + opt.out
    for tmp_entry in glob.glob('tmp/*'):
        if tmp_entry.startswith(run_prefix):
            os.remove(tmp_entry)
Exemplo n.º 53
0
from Bio import SeqIO
import argparse

parser = argparse.ArgumentParser(
    description='Process some fasta file to return the stats on the matches at some e-value')
parser.add_argument('-i', '--input', type=str, help="This in the fasta file to get the stats on")
parser.add_argument('-r', '--reference', type=str, help="This is the list of files to compare the stats on")

args = parser.parse_args()
input_file = args.input
ref_file = args.reference

# Count the reads per genome. The genome key is the part of the record id
# before the first '__'. The counter is deliberately not called `dict`, so
# the builtin stays usable, and the input file is closed once it is parsed.
genome_counts = {}
with open(input_file) as fasta_handle:
    for fasta in SeqIO.parse(fasta_handle, 'fasta'):
        genome = fasta.id.split('__')[0]
        genome_counts[genome] = genome_counts.get(genome, 0) + 1

# hit_list collects every genome key seen; multi_hits collects only the
# keys that were counted more than once.
hit_list = []
multi_hits = []
print("+++ MULTI MATCHES +++")
for item, count in genome_counts.items():
    hit_list.append(item)
    if count > 1:
        multi_hits.append(item)
Exemplo n.º 54
0
                        continue
                    else:
                        names.append(newname)
                        break
                else:
                    names.append(name)
                    break
        pathstofiles = []
        path = tempfile.mkdtemp(dir=self.tmpdir)

        for name, seq in zip(names, seqs):
            whole_path = os.path.join(path, name)+".gb"
            seq.write(whole_path)
            pathstofiles.append('"{}"'.format(whole_path))

        p = subprocess.Popen("{} {}".format(self.path_to_editor," ".join(pathstofiles)),
                             shell=True,
                             stdout = tempfile.TemporaryFile(),
                             stderr = tempfile.TemporaryFile()).pid
        time.sleep(0.5)
        #shutil.rmtree(path)
        #for name in names:
        #    print os.path.join(path, name)+".gb"

if __name__=="__main__":
    # Manual smoke test: read the first record from each of two GenBank test
    # files and pass both records to Ape.open().
    from Bio import SeqIO
    # Use the builtin next() rather than the iterator's .next() method, which
    # only exists on Python 2; next() works on Python 2.6+ and Python 3.
    sr1 = next(SeqIO.parse("../tests/pUC19.gb","gb"))
    sr2 = next(SeqIO.parse("../tests/pCAPs.gb","gb"))
    # NOTE(review): the editor command below is a machine-specific absolute
    # path; adjust it before running this on another system.
    aperunner = Ape("tclsh /home/bjorn/.ApE/apeextractor/ApE.vfs/lib/app-AppMain/AppMain.tcl")
    aperunner.open(sr1,sr2)