Example #1
def fix_mate_pairs(fq1, fq2, f_suffix="/1", r_suffix="/2"):
    """
    Takes two FASTQ files (fq1 and fq2) of paired-end sequencing data
    and filters out reads without a mate pair.
    """
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")

    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    f_dict = SeqIO.index(fq1, "fastq",
                         key_function=get_read_name_function(f_suffix))
    r_dict = SeqIO.index(fq2, "fastq",
                         key_function=get_read_name_function(r_suffix))

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for key in f_dict:
            if key in r_dict:
                fq1_out_handle.write(f_dict.get_raw(key))
                fq2_out_handle.write(r_dict.get_raw(key))
            else:
                fq1_single_handle.write(f_dict.get_raw(key))
        for key in r_dict:
            if key not in f_dict:
                fq2_single_handle.write(r_dict.get_raw(key))

    return [fq1_out, fq2_out]
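
A minimal sketch of the two helpers this example assumes; append_stem and get_read_name_function are not defined in the snippet, so the versions below are plausible reconstructions, not the original implementations:

import os

def append_stem(path, suffix):
    # hypothetical helper: insert a suffix before the file extension,
    # e.g. reads_1.fastq -> reads_1_fixed.fastq
    stem, ext = os.path.splitext(path)
    return "%s_%s%s" % (stem, suffix, ext)

def get_read_name_function(pair_suffix):
    # hypothetical helper: return a key function that strips a mate
    # suffix such as "/1", so both mates index under the same key
    def key_function(read_name):
        if read_name.endswith(pair_suffix):
            return read_name[:-len(pair_suffix)]
        return read_name
    return key_function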
Example #2
def parse_predicted_CDS_file(cds_file):
    """parse the cds file and index it.
    Take in the cds file and uses biopython to index it"""
    # this is for transdecoder names. May need to alter for other tools
    try:
        cds_database = SeqIO.index(cds_file,
                                   "fasta",
                                   key_function=strip_to_match_transcript_name)
        return cds_database
    except ValueError:
        outstr = ("WARNING: multi cds were predicted per transcript \n" +
        "\t- cannot change names. Going to pick the longest representative \n" +
        "\tcds per transcripts. Only do this if there are multiple cds \n" +
        "\tpredicted per transcript, otherwise message is not shown\n")
        logger.info(outstr)
        cds_database = SeqIO.index(cds_file, "fasta")
        # basically there are duplicates for each transcript.
        # So, find the longest representative and
        #  create a new cds_database, based on that
    # call function
    longest_rep = os.path.join("temp_fix_five_prime",
                               "longest_representative_seq.fasta")
    cds_database_new = find_longest_components(cds_file,
                                               cds_database,
                                               longest_rep)
    # return a seq_record object that can be
    # accessed in a dictionary like manner
    return cds_database_new
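
The key function used here is not shown. A minimal sketch, assuming TransDecoder-style IDs where an ORF suffix such as ".p1" is appended to the transcript name:

def strip_to_match_transcript_name(seq_id):
    # hypothetical reconstruction: drop a TransDecoder ORF suffix
    # (e.g. "comp0_c0_seq1.p1" -> "comp0_c0_seq1") so CDS records
    # index under their parent transcript name
    return seq_id.rsplit(".p", 1)[0]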
Example #3
def extract_nbs(fi, fo, fd_cds, fd_pro, fo_pro, fn_cds, fn_pro):
    cds_dict = SeqIO.index(fd_cds, "fasta")
    pro_dict = SeqIO.index(fd_pro, "fasta")
    fhi = open(fi, "r")
    fho = open(fo, "w")
    header = fhi.readline().strip("\n")
    print >> fho, header
    (nbs_cds_rcds, nbs_pro_rcds, pro_rcds) = ([], [], [])
    for line in fhi:
        line = line.strip("\n")
        gid, size, doms, e, domstr, tag, beg, end = line.split("\t")
        if tag == "":
            continue
        beg, end = int(beg), int(end)
        cds = str(cds_dict[gid].seq[(beg - 1) * 3 : end * 3])
        pro = str(pro_dict[gid].seq[(beg - 1) : end])
        cds_rcd = SeqRecord(Seq(cds), id=gid, description="")
        pro_rcd = SeqRecord(Seq(pro), id=gid, description="")
        nbs_cds_rcds.append(cds_rcd)
        nbs_pro_rcds.append(pro_rcd)
        pro_rcds.append(pro_dict[gid])
        print >> fho, line
    fho.close()
    SeqIO.write(nbs_cds_rcds, fn_cds, "fasta")
    SeqIO.write(nbs_pro_rcds, fn_pro, "fasta")
    SeqIO.write(pro_rcds, fo_pro, "fasta")
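
The slicing above converts 1-based, inclusive protein-domain coordinates into 0-based Python slices, scaling by 3 for the nucleotide sequence. A quick sanity check of that arithmetic:

# protein residues 2..4 (1-based, inclusive) span CDS bases 4..12,
# which is the 0-based Python slice [3:12] (9 bases = 3 codons)
beg, end = 2, 4
assert ((beg - 1) * 3, end * 3) == (3, 12)
assert ((beg - 1), end) == (1, 4)  # protein slice [1:4] = residues 2..4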
Example #4
    def __init__(self, ref_fpath, orf_seq_fpath):
        self.orf_suffix = '_orf_seq'
        self.ref_index = SeqIO.index(ref_fpath, 'fasta')
        self.orf_seq_index = SeqIO.index(orf_seq_fpath, 'fasta')
        blossum_path = join(DATA_DIR, 'blossum90.csv')
        self.blosum = _parse_blossum_matrix(blossum_path)

        self.conf = {}
Example #5
def seq_getter(transcripome, proteins, cds, blast, trinotate, names_file, Output_prefix):
    """Function to get the nt seq/pep of genes of interest (names), then opens the
    annotation database file and gets the corresponding protein seq which was
    predicted by transdecoder."""

    wanted = wanted_genes(names_file)
    ##########################################################################
    # This is the Trinotate sql database in tab format
    # COLUMN 0 IS THE GENE/COMPONENT ID
    # COLUMN 1 IS THE TRANSCRIPT ID
    # COLUMN 5 IS THE PROTEIN ID
    #gene_id	transcript_id	sprot_Top_BLASTX_hit	TrEMBL_Top_BLASTX_hit	RNAMMER	prot_id	prot_coords	sprot_Top_BLASTP_hit	TrEMBL_Top_BLASTP_hit	Pfam	SignalP	TmHMM	eggnog	gene_ontology_blast	gene_ontology_pfam	transcript	peptide
    ##########################################################################
    
    transcript_info_dict = tabular_file_to_info(trinotate)
    transcript_to_protein_dict = transcript_to_protein(trinotate)
    top_blast_hit_dict = top_blast_hit_database(blast)
                             
    #nucleotide seq out
    nucleotide_out_file = Output_prefix+".nt.fasta"
    protein_out_file = Output_prefix+"_cds.pep.fasta"
    cds_nt_file_out = Output_prefix+"_cds.nt.fasta"
    #nt out
    f_nt_out = open(nucleotide_out_file, 'w')
    #protein file out
    f_PROTEIN_out = open(protein_out_file, 'w')
    #cds_file out
    f_cds_out = open(cds_nt_file_out, 'w')
    #index the fasta files
    transcriptome_record_db = SeqIO.index(transcripome, "fasta")
    cds_record_db = SeqIO.index(cds, "fasta")
    pep_record_db = SeqIO.index(proteins, "fasta")

    for i in wanted:
        if i in transcriptome_record_db:
            record = transcriptome_record_db[i]
            SeqIO.write(record, f_nt_out, "fasta")
            transdecoder_protein = transcript_to_protein_dict[i]
            
            if transdecoder_protein != ".":
                peprecord = pep_record_db[transdecoder_protein]
                try:
                    peprecord.description = top_blast_hit_dict[transdecoder_protein]+"\t"+transcript_info_dict[i]
                except KeyError:
                    # Join the three fields with spaces into one string:
                    peprecord.description = transcript_info_dict[i]#" ".join(transcript_info_dict[i])
                SeqIO.write(peprecord, f_PROTEIN_out, "fasta")
                
                cds_of_interest = cds_record_db[transdecoder_protein]
                cds_of_interest.description = peprecord.description

                # TODO - fill in descr
                SeqIO.write(cds_of_interest, f_cds_out, "fasta")

    f_nt_out.close()
    f_cds_out.close()
    f_PROTEIN_out.close()
    return True
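
The wanted_genes helper used above is not part of the snippet. A minimal sketch, assuming the names file holds one identifier per line, with blank lines and "#" comments ignored:

def wanted_genes(names_file):
    # hypothetical reconstruction of the helper used above
    wanted = set()
    with open(names_file) as handle:
        for line in handle:
            line = line.strip()
            if line and not line.startswith("#"):
                wanted.add(line)
    return wanted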
Example #6
def seq_getter(blast_file,
               cds_file,
               protein_file,
               known_seq_db,
               known_name_list,
               folder_name):
    """Function to convert a top blast anlaysis query versus seq
    to a file containing these sequences."""
    print("current cds_file is false")
    cds_file = False
    if cds_file:
        # if we have a nt file... then create the files associated
        # with what is in the file, and get a filename dict
        nhandles = generate_dict_of_files(known_name_list,
                                          folder_name,
                                          "cds")
    if protein_file:
        # if we have an AA file... then create the files associated
        # with what is in the file, and get a filename dict
        phandles = generate_dict_of_files(known_name_list,
                                          folder_name,
                                          "pep")
    if protein_file:
        # index
        protein_sequences = SeqIO.index(protein_file, "fasta")
    if cds_file:
        nucleotide_sequences = SeqIO.index(cds_file, "fasta")
    print("Starting output...")
    names_already_printed = set([])
    blast_hits_wanted = open_blast_file(blast_file)
    for line in blast_hits_wanted:
        if not test_line(line):
            continue
        gene, blast_hit_matches = parse_blast_tab_hit(line)
        if cds_file:
            # get the nt seq from the gene models file
            seq_record = nucleotide_sequences[blast_hit_matches]
            SeqIO.write(seq_record, nhandles[gene], "fasta")
        if protein_file:
            # get the AA seq from the gene models file
            seq_record = protein_sequences[blast_hit_matches]
            SeqIO.write(seq_record, phandles[gene], "fasta")
        if gene not in names_already_printed:
            # get the seq from the known db
            seq_record = known_seq_db[gene]
            SeqIO.write(seq_record, phandles[gene], "fasta")
            names_already_printed.add(gene)
    # loop to close all the open files. There could be many!!
    if cds_file:
        for gene in known_name_list:
            nhandles[gene].close()
    if protein_file:
        for gene in known_name_list:
            phandles[gene].close()
Example #7
    def simple_check(self, filename, format, alphabet):
        if format in SeqIO._BinaryFormats:
            mode = "rb"
        else:
            mode = "r"
        id_list = [rec.id for rec in
                   SeqIO.parse(open(filename, mode), format, alphabet)]
        # Without key_function
        rec_dict = SeqIO.index(filename, format, alphabet)
        self.check_dict_methods(rec_dict, id_list, id_list)
        # Check with key_function
        key_list = [add_prefix(id) for id in id_list]
        rec_dict = SeqIO.index(filename, format, alphabet, add_prefix)
        self.check_dict_methods(rec_dict, key_list, id_list)
Example #8
def FastaGeneIDExtract(inffile, ids):
    outFile = open("temp.fas", "w")
    records_all = SeqIO.index(inffile, "fasta")
    heads = list(ids)
    count = 0
    for head in heads:
        outFile.write(str(">" + head + "\n" + records_all[head].seq + "\n"))
        count = count + 1
    outFile.close()
    print "Filtered fasta sequences extracted: ", count
    records_all = SeqIO.index("temp.fas", "fasta")
    os.system("rm temp.fas -f ")
    return records_all
Example #9
    def get_raw_check(self, filename, format, alphabet):
        if format in SeqIO._BinaryFormats:
            # This means SFF at the moment, which does not
            # implement the get_raw method
            return
        handle = open(filename, "rU")
        raw_file = handle.read()
        handle.close()
        #Also checking the key_function here
        id_list = [rec.id.lower() for rec in \
                   SeqIO.parse(filename, format, alphabet)]
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function = lambda x : x.lower())
        self.assertEqual(set(id_list), set(rec_dict.keys()))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assert_(key in rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assert_(raw.strip())
            self.assert_(raw in raw_file)
            if format in ["ig"]:
               #These have a header structure and can't be parsed
               #individually (at least, not right now).
               continue
            rec1 = rec_dict[key]
            rec2 = SeqIO.read(StringIO(raw), format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
Example #10
    def compute_scores(self):

        def score(record):
            seq = record.seq
            fracHelix = self.frac(seq, r'[HGI]')
            fracSheet = self.frac(seq, r'[BE]')
            fracBend = self.frac(seq, r'[STL]')
            fracLoop = self.frac(seq, r'C')
            return (fracHelix, fracSheet, fracBend, fracLoop)


        dtype=[('PrimaryID', '|S20'),
               ('fracHelix', float),
               ('fracSheet', float),
               ('fracBend', float),
               ('fracLoop', float),
        ]

        fastaPath = self.rawInputFilePath
        recordDict = SeqIO.index(fastaPath, "fasta")
        proteinIds = set([])
        allScores = []
        for key in recordDict.keys():
            match = re.search(r'(\w+)\|.*dssp', key)
            if match:
                pid = match.group(1)
                if pid not in proteinIds:
                    seqRec = recordDict.get(key)
                    allScores.append((pid,) + score(seqRec))
                    proteinIds.add(pid)
        
        allScores = np.array(allScores, dtype=dtype)
        return allScores        
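
The frac helper called by score is not shown. A minimal sketch, assuming it is a method on the same class that returns the fraction of residues in a DSSP secondary-structure string matching a regex character class:

import re

class _Scorer:  # hypothetical host class for the frac helper sketched below
    def frac(self, seq, pattern):
        # fraction of positions matching the given character class,
        # e.g. r'[HGI]' for helix states
        seq = str(seq)
        if not seq:
            return 0.0
        return len(re.findall(pattern, seq)) / float(len(seq))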
Example #11
    def reduce(self, long_percent=10, merged_path=None, output_path=None):
        """Selects the longest 10% proteins from the merged fasta file

        Parameters
        ----------
        long_percent : float, optional
            Determines the percentage of long proteins to be used for creating protein families
        merged_path : basestring, optional
            Path to merged faa file. If None, the path used by self.merge_genomes_files is used.
        output_path : basestring, optional
            Output path. If None, saves 'reduced.faa' in self.output_dir.

        """
        if output_path is None:
            output_path = os.path.join(self._output_dir, 'reduced.faa')
        if merged_path is None:
            if not hasattr(self, 'merged_path_'):
                raise ValueError('No merged fasta file')
            merged_path = self.merged_path_
        lens_and_ids = sorted([(len(rec), rec.id) for rec in SeqIO.parse(merged_path, 'fasta')], reverse=True)
        ids = [id for (length, id) in lens_and_ids]
        del lens_and_ids
        ids = ids[: len(ids) * long_percent // 100]  # keep the longest long_percent %
        rec_index = SeqIO.index(merged_path, 'fasta')
        with open(output_path, 'wb') as out_file:
            for id in ids:
                SeqIO.write(rec_index[id], out_file, 'fasta')
        print 'Saving reduced proteins as {}'.format(output_path)
        self.reduced_path_ = os.path.abspath(output_path)
Example #12
def fetch_fasta_db(
        table_name,
        download_url,
        fasta_filename=None,
        key_column='id',
        value_column='seq',
        subdir=None,
        version=1):
    """
    Download a FASTA file from `download_url` and store it locally as a sqlite3 database.
    """

    base_filename = normalize_filename(split(download_url)[1])
    db_filename = "%s.%s.%s.db" % (base_filename, key_column, value_column)

    fasta_path = fetch_file(
        download_url=download_url,
        filename=fasta_filename,
        subdir=subdir,
        decompress=True)

    fasta_dict = SeqIO.index(fasta_path, 'fasta')

    table = DatabaseTable.from_fasta_dict(
        table_name,
        fasta_dict,
        key_column=key_column,
        value_column=value_column)

    db_path = build_path(db_filename, subdir)

    return _create_cached_db(
        db_path,
        tables=[table],
        version=version)
Example #13
def find_longest_components(filename1,
                            cds_database,
                            out_filename):
    """this is a function to open up a fasta file, and
    producea a list of the longest representative transcripts per gene.
    This is only called is there are duplicated found with the same
    prefix name.
    Returns a new cds database with the longest cds per transcript only."""
    # this is out list of so called longest matches which we will
    # append and remove as applicable
    top_hits = []
    # current sequence length score value "to beat"
    current_length = 0
    # set up variables to assign latest values to ...
    transcriptome_Genes_names = set([])
    last_gene_name = ""
    last_component = ""
    loop_count = 0
    for seq_record in SeqIO.parse(filename1,
                                  "fasta"):
        sequence_len = len(seq_record)
        sequence_name = seq_record.id
        component = strip_to_match_transcript_name(sequence_name)
        # first time we see any record, save the values:
        if loop_count == 0:
            loop_count = loop_count + 1
            last_gene_name = sequence_name
            current_length = sequence_len
            last_component = component
            top_hits.append(seq_record.id)
        #########################################
        # first block: if the names are the same,
        # is the new length of sequence longer?
        if component == last_component:
            # print ("yes:", component, "component",  last_component,
            # "last_component", seq_record.id)
            # print ("current_length", current_length)
            if sequence_len > current_length:
                # print ("sequence_len > current_length", sequence_len,
                         #current_length)
                del top_hits[-1]
                top_hits.append(seq_record.id)
        ##########################################################
        # second block: if the name is new, put it in the name set.
        # use this sequence-length as the new one to "beat"
        else:
            top_hits.append(seq_record.id)
            last_gene_name = sequence_name
            current_length = sequence_len
            last_component = component
    outfile = open(out_filename, "w")
    for i in top_hits:
        seq_record = cds_database[i]
        SeqIO.write(seq_record, outfile,
                    "fasta")
    outfile.close()
    cds_database_new = SeqIO.index(out_filename,
                                   "fasta",
                                   key_function=strip_to_match_transcript_name)
    return cds_database_new
Example #14
    def setUp(self):
        self.aln_file = [TEST_ALIGN_FILE1,
                         TEST_ALIGN_FILE2,
                         TEST_ALIGN_FILE3,
                         TEST_ALIGN_FILE4,
                         TEST_ALIGN_FILE5,
                         TEST_ALIGN_FILE6]
        alns = []
        for i in self.aln_file:
            if i[1] == 'parse':
                nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
                prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
            elif i[1] == 'index':
                nucl = SeqIO.index(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
                prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
            elif i[1] == 'id':
                nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
                prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
                with open(i[0][2]) as handle:
                    id = dict((i.split()[0], i.split()[1]) for i in handle)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    caln = codonalign.build(prot, nucl, corr_dict=id, alphabet=codonalign.default_codon_alphabet)
            alns.append(caln)
            nucl.close()  # Close the indexed FASTA file
        self.alns = alns
Example #15
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.


    Each individual FASTA file contains all sequences from a single cluster.
    The sequences matching the IDs listed in the BLASTCLUST output .lst file 
    should all be found in the same file.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    with open(infname, 'r') as fh:
        otu_id = 0
        for line in fh:
            otu_id += 1
            outfname = os.path.join(outdirname,
                                    "blastclust_OTU_%06d.fasta" % otu_id)
            SeqIO.write((seqdict[key] for key in line.split()),
                        outfname, 'fasta')
            outfnames.append(outfname)
    return (outdirname, outfnames)
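
A hypothetical usage sketch, assuming a BLASTCLUST .lst file with one cluster of whitespace-separated sequence IDs per line, and a FASTA file containing every listed ID:

# clusters.lst might contain, for example:
#   seqA seqB seqC
#   seqD
outdirname, outfnames = blastclust_to_fasta("clusters.lst", "seqs.fasta", "results")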
Example #16
def rename_seq_id_to_bin_id(bin_fa_fns, header_prefix="scaffold",
                            sample_ids_abbrev={"GZ-Xyl_Y2": "GX2", "GZ-Xyl_Y1": "GX1",
                                               "GZ-Seed_Y0": "GS0", "GZ-Cell_Y1": "GC1",
                                               "GZ-Cell_Y2": "GC2", "SWH-Xyl_Y2": "SX2",
                                               "SWH-Xyl_Y1": "SX1", "SWH-Seed_Y0": "SS0",
                                               "SWH-Cell_Y1": "SC1", "SWH-Cell_Y2": "SC2",
                                               "SWH-Cell55_Y2": "S52"}):
    print("Combining bin files (" + str(len(bin_fa_fns)) + ")")
    
    bin_id_map = {}
    for bin_fa_fn in bin_fa_fns:
        print("Reading from " + bin_fa_fn)
        sample_id = os.path.basename(bin_fa_fn)
        bin_id = sample_id[::-1].split(".", 1)[1][::-1]
        sample_id = bin_id.split(".", 1)[0]
        if sample_id in sample_ids_abbrev:
            # look up the abbreviation only after confirming the key exists
            new_sample_id = sample_ids_abbrev[sample_id]
            print(sample_id + " to " + new_sample_id)
            bin_id = bin_id.replace(sample_id, new_sample_id)
            #bin_id = bin_id.replace(".", "_")
            
            print("bin_id=" + bin_id)
            
            seqs = SeqIO.index(bin_fa_fn, "fasta")
            seq_ids = list(seqs.keys())
            
            m = {seq_id.replace(header_prefix, new_sample_id):bin_id for seq_id in seq_ids}
            bin_id_map.update(m)
        else:
            print(sample_id + " does not exist in the provided sample id list.")
    
    return bin_id_map
Example #17
def extract_species_info_from_fasta(fa_fn, out_fn=None):
    import re
    from Bio import SeqIO
    
    print("Processing " + fa_fn)
    
    seqs = SeqIO.index(fa_fn, "fasta")
    seq_ids = list(seqs.keys())
    
    unknown_n = 0
    info_n = 0
    
    if out_fn is None:
        out_fn = fa_fn + ".gi"
    info_map = {}
    for seq_id in seq_ids:
        gi = seq_id.split("|")[1]
        desc = seqs[seq_id].description
        species = re.findall(r"\[(.+)\]", desc)
        if len(species) == 1:
            species = species[0]
            info_n = info_n + 1
        else:
            species = "Unknown"
            unknown_n = unknown_n + 1
        info_map[gi] = species

    sorted_map = sorted(info_map.items(), key=lambda x:x[1])
    OUT = open(out_fn, "w")
    for (gi, species) in sorted_map:   
        OUT.write(gi+"\t"+species+"\n")
    OUT.close()
    
    print("Export: " + str(info_n) + " (Unknown:" + str(unknown_n) + ")")
Example #18
def __main__():
	#Parse Command Line
	parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : [email protected]\n\n"
	"This script creates junctions between scaffolds using a tabulated file.\n"
	"The input tabulated file looks as follows:\n"
	">chr1\n"
	"scaffold1	FWD\n"
	"scaffold2	FWD\n"
	"scaffold3	REV\n"
	">...\n")
	# Wrapper options. 
	parser.add_option( '', '--table', dest='table', default='not_filled', help='The table file of scaffold to join')
	parser.add_option( '', '--fasta', dest='fasta', default='not_filled', help='The multi-fasta scaffold file')
	parser.add_option( '', '--out', dest='out', default='super_contig.fasta', help='The multi-fasta output file name, [default: %default]')
	parser.add_option( '', '--out_verif', dest='out_verif', default='contig2verif.txt', help='The output file to give to verif_fusion.py, [default: %default]')
	(options, args) = parser.parse_args()
	
	
	
	#verifying file	
	verif(options.table)
	
	#creating the scaffolds
	dico_fait = scaff(options.table, options.fasta, options.out, options.out_verif)
	
	#printing the remaining scaffold
	record_dict = SeqIO.index(options.fasta, "fasta")
	outfile = open(options.out,'a')
	for n in record_dict:
		if n not in dico_fait:
			SeqIO.write(record_dict[n], outfile, "fasta")
	outfile.close()
Example #19
    def simple_check(self, filename, format, alphabet):
        id_list = [rec.id for rec in
                   SeqIO.parse(open(filename), format, alphabet)]
        rec_dict = SeqIO.index(filename, format, alphabet)
        self.assertEqual(set(id_list), set(rec_dict.keys()))
        # This is redundant, I just want to make sure len works:
        self.assertEqual(len(id_list), len(rec_dict))
        # Make sure boolean evaluation works
        self.assertEqual(bool(id_list), bool(rec_dict))
        for key in id_list:
            self.assert_(key in rec_dict)
            self.assertEqual(key, rec_dict[key].id)
            self.assertEqual(key, rec_dict.get(key).id)
        # Check non-existent keys,
        try:
            rec = rec_dict[chr(0)]
            raise ValueError("Accessing a non-existent key should fail")
        except KeyError:
            pass
        self.assertEqual(rec_dict.get(chr(0)), None)
        self.assertEqual(rec_dict.get(chr(0), chr(1)), chr(1))
        # Now check iteritems...
        for key, rec in rec_dict.iteritems():
            self.assert_(key in id_list)
            self.assert_(isinstance(rec, SeqRecord))
            self.assertEqual(rec.id, key)
        # Now check non-defined methods...
        self.assertRaises(NotImplementedError, rec_dict.values)
        self.assertRaises(NotImplementedError, rec_dict.popitem)
        self.assertRaises(NotImplementedError, rec_dict.pop, chr(0))
        self.assertRaises(NotImplementedError, rec_dict.pop, chr(0), chr(1))
        self.assertRaises(NotImplementedError, rec_dict.clear)
        self.assertRaises(NotImplementedError, rec_dict.__setitem__, "X", None)
        self.assertRaises(NotImplementedError, rec_dict.copy)
        self.assertRaises(NotImplementedError, rec_dict.fromkeys, [])
Example #20
    def run_single(self, debug=0):
        warnings.simplefilter('always')
        warnings.warn("Deprecated method: run_BLAST.run_single\nBLAST single sequence, slow!! ", DeprecationWarning)

        print("Running AmiGO:BLAST")

        temp_output = open(self.outfile + "_temp", "w")
        if self.record_index is None:
            self.record_index = SeqIO.index(self.infile, "fasta")

        all_orfs = dict()

        for key in self.record_index:
            print(key)
            this_seq = GoSequence(key, self.record_index[key].seq)  # Bio.SeqRecord.SeqRecord
            this_seq.blast_AmiGO()
            this_seq.extract_ID()
            this_seq.parse_go_term(self.e_threshold)
#            seq.combined_terms
            self.results[key] = this_seq
            all_orfs[key] = this_seq.combined_terms
#            print this_seq
#            print this_seq.combined_terms
            temp_output.write("%s \t %s\n" % (key, this_seq.combined_terms))
#            temp_output.flush()
#        temp_output.close()

        self.counter = self.create_counter(all_orfs)
#        new_outfile = self.init_output(self.counter,0)
#        self.sample = self.update_sample_from_counters(new_outfile, self.counter)
#       hasattr

        output_csv(self.outfile, self.header, self.counter)
Example #21
    def run(self, debug=False):

        print("Running AmiGO:BLAST_Batch")

#        temp_output = open(self.outfile + "_temp", "w")
        if self.record_index is None:
            self.record_index = SeqIO.index(self.infile, "fasta")
            print("BLAST infile:%s" % self.infile)
#         print self.wdir
        self.tempfile = self.wdir + "/AmiGO_Record.temp"
        go = GOConnector(seq_record=self.record_index, max_query_size=self.batch_size,
                         e_value_cut_off=self.e_threshold, tempfile=self.tempfile,
                         debug=self.debug)

        go.amigo_batch_mode()


        all_seqs = go.all_seqs
        all_orfs = dict()
        for seq in all_seqs:
            key = seq.seq_id
            self.results[key] = seq
            all_orfs[key] = seq.combined_terms
#            print this_seq
#            print this_seq.combined_terms
#            temp_output.write("%s \t %s\n" % (key, seq.combined_terms))
#            temp_output.flush()
#        temp_output.close()

        self.counter = self.create_counter(all_orfs)
#        new_outfile = self.init_output(self.counter,0)
#        self.sample = self.update_sample_from_counters(new_outfile, self.counter)
#       hasattr

        output_csv(self.outfile, self.header, self.counter)
Example #22
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand):
    'It creates a function that annotates introns in cdnas by matching against the genomic sequence'
    genomic_seqs_fhand = get_fhand(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_fhand.name,
                                     guess_seq_file_format(genomic_seqs_fhand))
    def annotate_intron(sequence):
        'It adds the orf to the SeqFeatures'
        if sequence is None:
            return
        try:
            introns = infer_introns_for_cdna(sequence=sequence,
                                          genomic_db=genomic_db,
                                          genomic_seqs_index=genomic_seqs_index)
        except KeyError as error:
            error = str(error).lstrip('u').strip("'")
            if 'not found' in error:
                error += ' in seq file %s, but present in blast db %s' % \
                                           (genomic_seqs_fhand.name, genomic_db)
            raise RuntimeError(error)
        for intron_pos in introns:
            feature = SeqFeature(location=FeatureLocation(intron_pos,
                                                          intron_pos),
                                 type='intron',
                                 qualifiers={'genomic_db':genomic_db})
            sequence.features.append(feature)
        return sequence
    return annotate_intron
Example #23
def sort_name(source_file, source_file_type, direction=1):
    """
    Sort sequences by name. 1 is ascending (default) and 0 is descending.
    """

    direction_text = 'ascending' if direction == 1 else 'descending'

    logging.info("Indexing sequences by name: %s", direction_text)

    # Adapted from the Biopython tutorial example.

    # Sort on id
    ids = sorted((rec.id) for rec in SeqIO.parse(source_file,
                                                 source_file_type))

    if direction == 0:
        ids = reversed(ids)

    # SeqIO.index does not handle gzip instances
    if isinstance(source_file, gzip.GzipFile):
        tmpfile = tempfile.NamedTemporaryFile()
        source_file.seek(0)
        tmpfile.write(source_file.read())
        tmpfile.seek(0)
        source_file = tmpfile

    record_index = SeqIO.index(source_file.name, source_file_type)

    for id in ids:
        yield record_index[id]
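
Since sort_name yields SeqRecord objects lazily, its output can be streamed straight into SeqIO.write. A hypothetical usage sketch (input.fasta is a made-up filename):

from Bio import SeqIO

with open("input.fasta") as handle:
    SeqIO.write(sort_name(handle, "fasta"), "sorted.fasta", "fasta")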
Example #24
def main(argv):
    wd_dir = "."
    aln_fn_ext = "aln"
    aln_fns = glob.glob(wd_dir + "/*." + aln_fn_ext)
    aln_format = "fasta"
    export_fn_ext = "renamed"
    map_fn = "map.lst"
     
    id_map = {}
    for aln_fn in aln_fns:
        print("Processing " + aln_fn)
        seqs = SeqIO.index(aln_fn, aln_format)
        out_fn = aln_fn + "." + export_fn_ext
        with open(out_fn, "w") as OUT:
            for id in seqs.keys():
                cflag_existed = False
                while not cflag_existed:
                    new_id = generate_id(id)
                    if new_id not in id_map.keys():
                        cflag_existed = True
                #print("Maping " + id + " to " + new_id)
                
                id_map[new_id] = id + "\t" + os.path.basename(aln_fn)
                seq = seqs[id]
                seq.id = new_id
                seq.name = new_id
                seq.description = ""
                
                SeqIO.write(seq, OUT, "fasta")
                
    with open(map_fn, "w") as OUT:
        for id in id_map:
            OUT.write(id + "\t" + id_map[id] + "\n")
Example #25
    def create_biopython_iterator(self, **kwargs):
        from Bio import SeqIO
        print "Generating BioPython sequence index.  This may take a moment...."
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self.readcount = len(self.fasta)
        self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)
Example #26
    def ById(self, event):
        box=wx.TextEntryDialog(None, "Enter Sequence ID\nFor Multiple IDs use comma (,) as separator", "Sequence ID", "ID")

        if box.ShowModal() == wx.ID_OK:
            IDlist = box.GetValue()
        inFile = self.globalFile
        self.logger.AppendText("Input file:  "+inFile+"\n"+"Start Time:  "+str(time.asctime())+"\n")
        start_time = time.time()

        FastaFile = SeqIO.index(inFile, "fasta")

        outFile = open(inFile + "_by_ID.fasta", "w")

        IDlist = IDlist.replace(" ", "")
        IDs = IDlist.split(",")
        for ID in IDs:
            if ID in FastaFile:
                outFile.write(FastaFile.get_raw(ID))
                self.logger.AppendText("Wrote "+inFile+"_by_ID.fasta"+"    "+str(time.asctime())+"\n")
            else:
                self.logger.AppendText(ID + " not present in the Fasta file\n")
        outFile.close()
        end_time=str(time.time() - start_time)
        self.logger.AppendText("Fasta file making by ID is completed\n")
        self.logger.AppendText("Finish Time:  "+str(time.asctime())+"\nTime elapsed:  "+end_time+"  seconds\n")
        self.logger.AppendText("--------------------------------------------------------------------------------------------------------\n\n")
        wx.MessageBox("Fasta file making by ID is completed\nTime elapsed:  "+end_time+"  seconds")
Example #27
def seq_getter(filename_in, wantedfile, threshold, outfile):
    "Script to get sequences of interest from a file of wanted genes."

    f = open(outfile, 'w')
    wanted = open(wantedfile, "r")

    names = wanted.readlines()
    #print names
    wanted_data = [line.replace("\t", "").rstrip("\n") for line in names
              if line.strip() != ""]
    name_set = set([])
    for i in wanted_data:
        if not i.startswith("#"):
            i = i.rstrip()
            name_set.add(i)
    #print wanted_data

    cds_database = SeqIO.index(filename_in, "fasta")
    #record = SeqIO.read(filename, "fasta")
    for i in name_set:
        if "\r\n" in i:
            i = i.replace("\r\n","")
        #print i
        seq_record = cds_database[i]
        dashes = seq_record.seq.count("-")
        print 100*(float(dashes)/len(seq_record.seq))
        if 100*(float(dashes)/len(seq_record.seq)) < int(threshold):
            #print 'boomshanka'
            SeqIO.write(seq_record, f, "fasta")
    f.close()
    return True
Example #28
def shortrna_regions(mirna_gff, star_csv, seq_file):
    """Return miRNA sequences with corresponding guide and star regions.
    """
    seq_index = SeqIO.index(seq_file, "fasta")
    mirna_seqs = dict()
    with open(star_csv) as in_handle:
        for name, guide, star in csv.reader(in_handle):
            mirna_seqs[name] = (guide.strip(), star.strip())

    for rec in GFF.parse(mirna_gff):
        cur_seq = str(seq_index[rec.id].seq)
        for f in rec.features:
            name = f.qualifiers["ID"][0]
            start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end)
            yield (rec.id, start, end, name)
            #guide, star = mirna_seqs.get(name, ("", ""))
            for seq_name, guide, star in [(n, g, s) for n, (g, s) in
                    mirna_seqs.iteritems() if n.startswith(name)]:
                for find_seq, ext in [(guide, "guide"), (star, "star")]:
                    if find_seq:
                        if f.strand == -1:
                            find_seq = str(Seq(find_seq).reverse_complement())
                        region = cur_seq[start:end]
                        pos = region.find(find_seq)
                        if pos > -1:
                            yield (rec.id, start + pos, start + pos + len(find_seq),
                                    "%s_%s" % (seq_name, ext))
                        else:
                            print f.strand, name, ext, pos, find_seq, region
                            raise NotImplementedError
Example #29
def extract_long_reads():
    """Filter fastq to longest reads."""

    parser = argparse.ArgumentParser(description='Extract longest reads from a fastq.')
    parser.add_argument('input',
        help='Input .fastq file.')
    parser.add_argument('output',
        help='Output .fastq file.')
    parser.add_argument('longest', default=10, type=int,
        help='Percentage of longest reads to partition.')
    parser.add_argument('--others', default=None,
        help='Write all other reads to file.')
    args = parser.parse_args()

    record_dict = SeqIO.index(args.input, "fastq")
    ids = list(record_dict.keys())
    lengths = np.fromiter(
        (len(record_dict[i]) for i in ids),
        dtype=int, count=len(ids)
    )
    # integer count of reads to keep (np.argpartition needs an int kth)
    max_reads = max(1, len(ids) * args.longest // 100)
    longest = np.argpartition(lengths, -max_reads)[-max_reads:]

    SeqIO.write(
        (record_dict[ids[i]] for i in longest),
        args.output, 'fastq'
    )

    if args.others is not None:
        longest = set(longest)
        SeqIO.write(
            (record_dict[ids[i]] for i in range(len(ids)) if i not in longest),
            args.others, 'fastq'
        )
Example #30
def sort_length(source_file, source_file_type, direction=1):
    """
    Sort sequences by length. 1 is ascending (default) and 0 is descending.
    """
    direction_text = 'ascending' if direction == 1 else 'descending'

    logging.info('Indexing sequences by length: %s', direction_text)

    # Adapted from the Biopython tutorial example.

    # Get the lengths and ids, and sort on length
    len_and_ids = sorted((len(rec), rec.id)
                         for rec in SeqIO.parse(source_file, source_file_type))

    if direction == 0:
        ids = reversed([seq_id for (length, seq_id) in len_and_ids])
    else:
        ids = [seq_id for (length, seq_id) in len_and_ids]
    del len_and_ids  # free this memory

    # SeqIO.index does not handle gzip instances
    if isinstance(source_file, gzip.GzipFile):
        tmpfile = tempfile.NamedTemporaryFile()
        source_file.seek(0)
        tmpfile.write(source_file.read())
        tmpfile.seek(0)
        source_file = tmpfile

    record_index = SeqIO.index(source_file.name, source_file_type)

    for seq_id in ids:
        yield record_index[seq_id]
Example #31
    def simple_check(self, filename, format, alphabet, comp):
        """Check indexing (without a key function)."""
        if comp:
            h = gzip_open(filename, format)
            id_list = [rec.id for rec in SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            id_list = [
                rec.id for rec in SeqIO.parse(filename, format, alphabet)
            ]

        with warnings.catch_warnings():
            if "_alt_index_" in filename:
                # BiopythonParserWarning: Could not parse the SFF index:
                # Unknown magic number b'.diy' in SFF index header:
                # b'.diy1.00'
                warnings.simplefilter('ignore', BiopythonParserWarning)

            rec_dict = SeqIO.index(filename, format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            del rec_dict

            if not sqlite3:
                return

            # In memory,
            # note here give filenames as list of strings
            rec_dict = SeqIO.index_db(":memory:", [filename], format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            del rec_dict

            # check error conditions
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              format="dummy")
            self.assertRaises(ValueError,
                              SeqIO.index_db,
                              ":memory:",
                              filenames=["dummy"])

            # Saving to file...
            index_tmp = self.index_tmp
            if os.path.isfile(index_tmp):
                os.remove(index_tmp)

            # To disk,
            # note here we give the filename as a single string
            # to confirm that works too (convenience feature).
            rec_dict = SeqIO.index_db(index_tmp, filename, format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload it...
            rec_dict = SeqIO.index_db(index_tmp, [filename], format, alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            # Now reload without passing filenames and format
            # and switch directory to check paths still work
            index_tmp = os.path.abspath(index_tmp)
            os.chdir(os.path.dirname(filename))
            rec_dict = SeqIO.index_db(index_tmp, alphabet=alphabet)
            self.check_dict_methods(rec_dict, id_list, id_list)
            rec_dict.close()
            rec_dict._con.close()  # hack for PyPy
            del rec_dict

            os.remove(index_tmp)
Example #32
import sys
import os
import os.path
import shutil
import subprocess
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

Barrnap = "/home/j/jparkins/mobolaji/Tools/Barrnap/bin/barrnap"
Infernal = "/home/j/jparkins/mobolaji/Tools/Infernal/infernal-1.1.2-linux-intel-gcc/binaries/cmsearch"
Rfam = "/home/j/jparkins/mobolaji/Databases/Rfam_rRNA.cm"

reference_file = sys.argv[1]
reference_sequences = SeqIO.to_dict(SeqIO.parse(reference_file, "fastq"))
dedeplicated_file = sys.argv[2]
dedeplicated_sequences = SeqIO.index(dedeplicated_file, "fastq")
cluster_file = sys.argv[3]
cluster_map = {}
reduplicated_file = sys.argv[4]
reduplicated_ids = set()
reduplicated_seqs = []

with open(cluster_file, "r") as clustr_read:
    rep = ""
    seq_id = ""
    for line in clustr_read:
        if line.startswith(">"):
            continue
        elif line.startswith("0"):
            rep = line[line.find(">") + 1:line.find("...")]
            seq_id = rep
Example #33
#hg38_pairwise_part = record_dict_hg38["NC_000001.11"].seq[1622920:1634474].upper()
#hg38_pairwise_part = record_dict_hg38["NC_000001.11"].seq[1626620:1642639].upper()

#hg37 = SeqIO.parse("/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/grch37/Homo_sapiens.GRCh37.dna_sm.primary_assembly.refseqids.fa", "fasta")
#hg37_pairwise_part = record_dict_hg37["NC_000001.10"].seq[1558300:1569850].upper()
#hg37_pairwise_part = record_dict_hg37["NC_000001.10"].seq[1562000:1578000].upper()

#alignments_g = pairwise2.align.globalxx(hg38_pairwise_part, hg37_pairwise_part)
#print(format_alignment(*alignments_g[0]))

#alignments_l = pairwise2.align.localxx(hg38_pairwise_part, hg37_pairwise_part)
#print(format_alignment(*alignments_l[0]))

data = pd.read_csv('data.csv')
record_dict_hg38 = SeqIO.index(
    "/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/1405.15/GRCh38_no_alt_analysis_set.refseqids.fna",
    "fasta")
record_dict_hg37 = SeqIO.index(
    "/proj/ncgenes2/src/ncgenes2-exome-pipeline/modules/apps/human-genome-for-alignment/grch37/Homo_sapiens.GRCh37.dna_sm.primary_assembly.refseqids.fa",
    "fasta")


def hg37_fun(d, dict37):
    #print(d['hg37_chr'], d['hg37_start'], d['hg37_end'])
    for index, row in d.head(n=10).iterrows():
        hg37_p = dict37[row['hg37_chr']].seq[
            int(row['hg37_start']):int(row['hg37_end'])].upper()
        #print("this should be the hg37 sequence")
    return hg37_p

Example #34
#count reads

inputfile = sys.argv[1]
outputfile = sys.argv[2]
chunksize = int(sys.argv[3])  #number of reads in each division of the input file
threads = int(sys.argv[4])
fmt = sys.argv[5]  #input format

subprocess.Popen("rm -rf ~/scripts/tmp2", shell=True).wait()
subprocess.Popen("mkdir ~/scripts/tmp2", shell=True).wait()

out1 = open('/OSM/HOME-MEL/all29c/scripts/tmp2/finaloutput.ublast', 'w')

count = SeqIO.index(inputfile, fmt)

c = len(count)

#print c
a = int(c)
print "num reads=", a
if chunksize > a:
    chunksize = a
numchunks = (a / chunksize)

#print"Number of chunks=",numchunks

#sys.stdout.write(str(numchunks))

t = 0
Example #35
    def _make_db(self):
        """
        :return: an index of the sequences contained in protfile corresponding to the replicon
        """
        return SeqIO.index(self._prot_file, "fasta", alphabet=Seq.IUPAC.extended_protein)
Example #36
    for line in handle:
        if line.strip() and not line.startswith("#"):
            field = line.rstrip("\n").split("\t")[col].strip()
            parts = field.split(None, 1)
            if len(parts) > 1 and not warn:
                warn = "WARNING: Some of your identifiers had white space in them, " + \
                       "using first word only. e.g.:\n%s\n" % field
            yield parts[0]
    handle.close()
    if warn:
        sys.stderr.write(warn)


# Index the sequence file.
# If very big, could use SeqIO.index_db() to avoid memory bottleneck...
records = SeqIO.index(in_file, seq_format)
print("Indexed %i sequences" % len(records))

if seq_format.lower() == "sff":
    # Special case to try to preserve the XML manifest
    try:
        from Bio.SeqIO.SffIO import SffWriter
    except ImportError:
        sys.exit("Requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
Example #37
    description='Extract a subset of unique sequences, starting from a '
                'FASTA file containing the parent sequences',
)

parser.add_argument('--fasta_file',
                    nargs=1,
                    required=True,
                    help='FASTA file to be processed')
parser.add_argument('--output_file',
                    nargs=1,
                    required=True,
                    help='Output file')
args = parser.parse_args()

sequences_dict = SeqIO.index(args.fasta_file[0], 'fasta')
sequences = sorted(sequences_dict.keys())
print "Loaded {0:s} sequences".format(str(len(sequences)))

unique_seqs = []
duplicated_seqs = []

i = 0
j = 0

# Iterate over sequences ids
while i < len(sequences):

    # Extract master sequence for comparison
    master_seq = sequences_dict[sequences[i]]
    duplicated_seqs = []
Example #38
def PPHMMSignatureTable_Constructor(
    SeqIDLists,
    GenBankFile,
    TranslTableList,
    HMMER_PPHMMDB,
    HMMER_hmmscanDir,
    HMMER_N_CPUs=7,
    HMMER_C_EValue_Cutoff=1E-3,
    HMMER_HitScore_Cutoff=0,
    SeqLength_Cutoff=0,
):
    #Load GenBank record
    #---------------------------------------------------------------------
    _, file_extension = os.path.splitext(GenBankFile)
    if file_extension in [".fas", ".fasta"]:
        Records_dict = SeqIO.index(GenBankFile, "fasta")
    elif file_extension in [".gb"]:
        Records_dict = SeqIO.index(GenBankFile, "genbank")

    Records_dict = {k.split(".")[0]: v for k, v in Records_dict.iteritems()}
    N_Seq = len(SeqIDLists)

    #Specify PPHMMQueryFile and PPHMMScanOutFile
    #---------------------------------------------------------------------
    PPHMMDB_Summary = HMMER_PPHMMDB + "_Summary.txt"
    N_PPHMMs = LineCount(PPHMMDB_Summary) - 1
    PPHMMQueryFile = HMMER_hmmscanDir + "/QProtSeqs.fasta"
    PPHMMScanOutFile = HMMER_hmmscanDir + "/PPHMMScanOut.txt"

    PPHMMSignatureTable = np.empty((0, N_PPHMMs))
    PPHMMLocMiddleBestHitTable = np.empty((0, N_PPHMMs))

    Seq_i = 0.0
    for SeqIDList, TranslTable in zip(SeqIDLists, TranslTableList):
        GenBankSeqList = []
        GenBankIDList = []
        GenBankDescList = []
        for SeqID in SeqIDList:
            GenBankRecord = Records_dict[SeqID]
            GenBankSeqList.append(GenBankRecord.seq)
            GenBankIDList.append(GenBankRecord.id)
            GenBankDescList.append(GenBankRecord.description)

        #sort lists by sequence/segment lengths
        #---------------------------------------------------------------------
        (GenBankSeqLenList, GenBankSeqList, GenBankIDList,
         GenBankDescList) = zip(
             *sorted(zip(map(len, map(str, GenBankSeqList)), GenBankSeqList,
                         GenBankIDList, GenBankDescList),
                     reverse=True))
        GenBankSeq = ""
        for seq in GenBankSeqList:
            GenBankSeq = GenBankSeq + seq

        if len(
                GenBankSeq
        ) >= SeqLength_Cutoff:  #limit the sequence by length; 0=include sequences of all lengths
            GenBankID = "/".join(GenBankIDList)
            GenBankDesc = "/".join(GenBankDescList)
            ProtSeq1 = SeqRecord(GenBankSeq[0:].translate(table=TranslTable),
                                 id=GenBankID + '_+1')
            ProtSeq2 = SeqRecord(GenBankSeq[1:].translate(table=TranslTable),
                                 id=GenBankID + '_+2')
            ProtSeq3 = SeqRecord(GenBankSeq[2:].translate(table=TranslTable),
                                 id=GenBankID + '_+3')
            ProtSeqC1 = SeqRecord(
                GenBankSeq.reverse_complement()[0:].translate(
                    table=TranslTable),
                id=GenBankID + '_-1')
            ProtSeqC2 = SeqRecord(
                GenBankSeq.reverse_complement()[1:].translate(
                    table=TranslTable),
                id=GenBankID + '_-2')
            ProtSeqC3 = SeqRecord(
                GenBankSeq.reverse_complement()[2:].translate(
                    table=TranslTable),
                id=GenBankID + '_-3')

            ProtSeq6frames = ProtSeq1 + ProtSeq2 + ProtSeq3 + ProtSeqC1 + ProtSeqC2 + ProtSeqC3
            ProtSeq6frames.id = GenBankID
            with open(PPHMMQueryFile, "w") as PPHMMQuery_txt:
                SeqIO.write(ProtSeq6frames, PPHMMQuery_txt, "fasta")

            p = subprocess.Popen(
                "hmmscan --cpu %s --noali --nobias --domtblout %s %s %s" %
                (HMMER_N_CPUs, PPHMMScanOutFile, HMMER_PPHMMDB,
                 PPHMMQueryFile),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                shell=True)
            out, err = p.communicate()

            PPHMMIDList = []
            PPHMMScoreList = []
            FeatureFrameBestHitList = []
            FeatureLocFromBestHitList = []
            FeatureLocToBestHitList = []
            FeatureDescList = []
            with open(PPHMMScanOutFile, "r") as PPHMMScanOut_txt:
                for Line in PPHMMScanOut_txt:
                    if Line[0] != "#":
                        Line = Line.split()
                        Line[22] = " ".join(
                            Line[22:]
                        )  #Concatenate the cluster description back
                        Line = Line[:23]
                        C_EValue = float(Line[11])
                        HitScore = float(Line[7])
                        OriAASeqlen = float(len(GenBankSeq)) / 3
                        if C_EValue < HMMER_C_EValue_Cutoff and HitScore > HMMER_HitScore_Cutoff:
                            #Determine the frame and the location of the hit
                            #------------------------------------------------------
                            ID = int(Line[0].split('_')[-1])
                            HitFrom = int(Line[17])
                            HitTo = int(Line[18])
                            HitMid = float(HitFrom + HitTo) / 2
                            if np.ceil(HitMid / OriAASeqlen) <= 3:
                                Frame = int(np.ceil(HitMid / OriAASeqlen))
                            else:
                                Frame = int(-(np.ceil(HitMid / OriAASeqlen) -
                                              3))
                            LocFrom = int(HitFrom % OriAASeqlen)
                            if LocFrom == 0:  #if the hit falls precisely at the end of the sequence
                                LocFrom = int(OriAASeqlen)
                            LocTo = int(HitTo % OriAASeqlen)
                            if LocTo == 0:  #if the hit ends precisely at the end of the sequence
                                LocTo = int(OriAASeqlen)
                            if LocTo < LocFrom:  #The hit (falsely) spans across sequences of different frames
                                if np.ceil(HitFrom / OriAASeqlen) <= 3:
                                    HitFrom_Frame = int(
                                        np.ceil(HitFrom / OriAASeqlen))
                                else:
                                    HitFrom_Frame = int(
                                        -(np.ceil(HitFrom / OriAASeqlen) - 3))
                                if np.ceil(HitTo / OriAASeqlen) <= 3:
                                    HitTo_Frame = int(
                                        np.ceil(HitTo / OriAASeqlen))
                                else:
                                    HitTo_Frame = int(
                                        -(np.ceil(HitTo / OriAASeqlen) - 3))
                                if Frame == HitFrom_Frame:
                                    LocTo = int(OriAASeqlen)
                                elif Frame == HitTo_Frame:
                                    LocFrom = int(1)
                                elif HitFrom_Frame != Frame and Frame != HitTo_Frame:
                                    LocFrom = int(1)
                                    LocTo = int(OriAASeqlen)
                                else:
                                    print(
                                        "Something is wrong with the his location determination"
                                    )
                                    raw_input("Press any key to continue")
                            if ID not in PPHMMIDList:
                                Best_C_EValue = C_EValue
                                PPHMMIDList.append(ID)
                                PPHMMScoreList.append(HitScore)
                                FeatureDescList.append(Line[22].split('|')[0])

                                FeatureFrameBestHitList.append(Frame)
                                FeatureLocFromBestHitList.append(LocFrom * 3)
                                FeatureLocToBestHitList.append(LocTo * 3)
                            else:
                                if C_EValue < Best_C_EValue:
                                    Best_C_EValue = C_EValue
                                    FeatureFrameBestHitList[-1] = Frame
                                    FeatureLocFromBestHitList[-1] = LocFrom * 3
                                    FeatureLocToBestHitList[-1] = LocTo * 3

            FeatureLocMiddleBestHitList = np.zeros(N_PPHMMs)
            FeatureLocMiddleBestHitList[PPHMMIDList] = np.mean(
                np.array([FeatureLocFromBestHitList, FeatureLocToBestHitList]),
                axis=0
            ) * (
                np.array(FeatureFrameBestHitList) /
                abs(np.array(FeatureFrameBestHitList))
            )  #Absolute coordinate with orientation info encoded into it: +ve if the gene is present on the (+)strand, otherwise -ve
            PPHMMLocMiddleBestHitTable = np.vstack(
                (PPHMMLocMiddleBestHitTable, FeatureLocMiddleBestHitList))

            FeatureValueList = np.zeros(N_PPHMMs)
            FeatureValueList[PPHMMIDList] = PPHMMScoreList
            PPHMMSignatureTable = np.vstack(
                (PPHMMSignatureTable, FeatureValueList))

        Seq_i = Seq_i + 1.0

        #Progress bar
        sys.stdout.write(
            "\033[K" +
            "Generate PPHMM signature and location profiles: [%-20s] %d/%d profiles"
            % ('=' * int(Seq_i / N_Seq * 20), Seq_i, N_Seq) + "\r")
        sys.stdout.flush()

    sys.stdout.write("\033[K")
    sys.stdout.flush()
    return (PPHMMSignatureTable, PPHMMLocMiddleBestHitTable)
Example #39
    start of the region - the start base will be included and this is 1-indexed (i.e. use start=1 to start at the beginning of the contig)
    end of the region - the end is included in the output
    ploidy (This script obviously only makes sense if variants are phased. One sequence per genome copy will be generated)
    path to the output file. This will be in fasta format.
An 8th argument can optionally be supplied and should specify the path to a file containing a list of individuals to be included.
If not supplied, all individuals will be used.'''

print 'Alignment-from-vcf, written by Stephan Kamrad ([email protected])'

if len(sys.argv) not in [8, 9]:
    print helpstr
    sys.exit('Invalid arguments supplied')

ref_path = sys.argv[1]
print 'Reading the reference genome: ' + ref_path
ref = SeqIO.index(ref_path, "fasta")

vcf_path = sys.argv[2]
print 'Reading the vcf: ' + vcf_path
vcffile = VariantFile(vcf_path)

contig = sys.argv[3]
start = int(sys.argv[4])
end = int(sys.argv[5])
print 'Getting variants in region %s:%i-%i' % (contig, start, end)
variants = list(vcffile.fetch(contig, start - 1, end))
ref_seq = ref[contig].seq[start - 1:end]

if len(variants) == 0:
    raise Exception('No variants in specified region. Terminating.')
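One coordinate detail worth spelling out: pysam's VariantFile.fetch takes 0-based, half-open coordinates, which is why the call above passes start - 1 for a 1-based, inclusive region. A minimal sketch (file and contig names hypothetical):

from pysam import VariantFile

vcf = VariantFile("calls.vcf.gz")
# bases 1-100 in 1-based, inclusive terms == fetch(contig, 0, 100)
for rec in vcf.fetch("chr1", 0, 100):
    print(rec.pos, rec.ref, rec.alts)  # rec.pos is reported 1-based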
Exemplo n.º 40
0
def prepare_for_ParaAT(joined_df_file_name, id_file, out_dir):
    '''
    input 1: joined_df_file_name
    input 2: id_file
    output 1: out_dir
    '''
    global coding_gene_base, protein_base, gene_fasta_fl, protein_fasta_fl
    gene_dir_path = directory_creater(out_dir / "nucleotide")
    protein_dir_path = directory_creater(out_dir / "aminoacid")
    fasta_base_dir_path = directory_creater(out_dir / "gene_protein_base")
    homolog_dir_path = directory_creater(out_dir / "homolog")
    protein_base_file_path = fasta_base_dir_path / "protein_base.fasta"
    coding_gene_base_file_path = fasta_base_dir_path / "coding_gene_base.fasta"
    if not coding_gene_base_file_path.is_file():
        coding_gene_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_transcripts.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_CDS.fasta"
        ]
        protein_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_proteins_T0.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_protein.fasta"
        ]
        merge_to_one(coding_gene_base_list, id_file, "_CDS.fasta",
                     fasta_base_dir_path / "coding_gene_list.txt",
                     coding_gene_base_file_path,
                     fasta_base_dir_path / "coding_gene_cat_err.txt")
        merge_to_one(protein_base_list, id_file, "_protein.fasta",
                     fasta_base_dir_path / "protein_list.txt",
                     protein_base_file_path,
                     fasta_base_dir_path / "protein_cat_err.txt")
    coding_gene_base = SeqIO.index(str(coding_gene_base_file_path),
                                   "fasta",
                                   key_function=get_id_protein)
    protein_base = SeqIO.index(str(protein_base_file_path),
                               "fasta",
                               key_function=get_id_protein)
    base = importr("base")
    utils = importr("utils")
    ortholog_joined_df = utils.read_table(joined_df_file_name,
                                          sep="\t",
                                          header=True,
                                          **{'stringsAsFactors': False,
                                             'check.names': False})
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)
    for i in range(1, (int(base.nrow(ortholog_joined_df)[0]) + 1)):
        df_row = ortholog_joined_df_sub.rx(i, True)
        df_row_iter = iter(df_row)
        head_list = next(df_row_iter)[0].split()
        if len(head_list) == 1:
            gene_fasta = gene_dir_path / (head_list[0] + ".fasta")
            protein_fasta = protein_dir_path / (head_list[0] + ".fasta")
            homolog_file_path = homolog_dir_path / (head_list[0] + ".txt")
            if gene_fasta.is_file(): continue
            with gene_fasta.open('w') as gene_fasta_fl:
                with protein_fasta.open('w') as protein_fasta_fl:
                    with homolog_file_path.open('w') as homolog_fl:
                        extract_gene(head_list[0])
                        homolog_fl.write(head_list[0] + "\t")
                        for homolog_id in one_head(df_row_iter):
                            homolog_fl.write(homolog_id + "\t")
                        homolog_fl.write("\n")
        else:
            two_head(head_list, df_row_iter)
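Both indices above rely on the key_function hook of SeqIO.index, which receives each record's identifier string and returns the lookup key. get_id_protein is the author's helper and is not shown; a hypothetical stand-in to show the shape of the hook:

from Bio import SeqIO

def strip_version(identifier):
    # hypothetical key function: drop a trailing ".1"-style version suffix
    return identifier.split(".")[0]

idx = SeqIO.index("proteins.fasta", "fasta", key_function=strip_version)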
Exemplo n.º 41
0
    outfile = open(str(file.split('.')[0]) + '_clean.fa', 'w')
    for line in infile:
        if '>' in line:
            names = line.split()
            for name in names:
                if name[0] == '>':
                    outfile.write(name + '\n')
        else:
            outfile.write(line)

    outfile.close()
    infile.close()


record_dict = SeqIO.index(
    argv[1], "fasta"
)  # "parses" a fasta file, creating a dictionary-like object of sequences. not everything is kept in memory. instead it just records where each record is within the file. parses on demand.
# the key in the dictionary is the ">" line in the fasta file
min_pro_len = int(
    argv[2]
)  # minimum protein length, within typical VSG protein length, in a.a.

contig_outfile = open(argv[3],
                      'w')  # initializing contig output file, reference

ORF_outfile = open(
    argv[4], 'w')  # initializing open reading frame output file, what we like

noduplicate = []  # initialize array/list

orf_dict = {}  # initialize dictionary
ANNOT_INFILE = sys.argv[5]
if CSV:
    OUTFILE = './reference.csv'
else:
    OUTFILE = './reference.fasta'
hya_annots_dict = {}
max_length_dict = {}

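# The four lookups below each map the first tab-separated column of an input
# file to a single value column; the intended meanings (inferred from the
# variable names) are average coverage, fraction covered, best blastn match,
# and annotation, respectively.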
avg_cov_dict = dict(line.strip().split('\t') for line in open(AVG_INFILE))
perc_cov_dict = dict(
    ((line.strip().split('\t'))[0], (line.strip().split('\t'))[3])
    for line in open(ZERO_INFILE))
blastn_matches_dict = dict(
    (line.strip().split('\t'))[0:2] for line in open(BLASTN_INFILE))
apalm_annotations_dict = dict(
    (line.strip().split('\t'))[0:3:2] for line in open(ANNOT_INFILE))
# gene_name : SeqRecord of gene seq
record_dict = SeqIO.index(REF_INFILE, 'fasta')


def annotation():
    for transcript in blastn_matches_dict:
        # get name format of transcript model that will match apalm annotation dict
        apalm_model = '_'.join(blastn_matches_dict[transcript].split('_')[1:])
        if apalm_model in apalm_annotations_dict:
            annotation = apalm_annotations_dict[apalm_model]
            hya_annots_dict[transcript] = annotation
        else:
            hya_annots_dict[transcript] = 'NONE'


def max_cont_length():
    # split sequence by continuous sequence in genes (i.e. without Ns)
Exemplo n.º 43
0
def readGenome(fasta):
    genome_dict = SeqIO.index(fasta, "fasta")
    print(len(genome_dict))
    return genome_dict
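A hypothetical usage of the helper above (file and contig names invented):

genome = readGenome("genome.fa")
chr1 = genome["chr1"]  # SeqRecord, parsed from disk on demand
print(chr1.id, len(chr1.seq))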
Exemplo n.º 44
0
        align_format = "fasta"  #could be clustal
        prot_align_file = full_path_to_file
        nuc_fasta_file = os.path.expandvars(
            "$HOME/scratch/BUSCO_cegma/gene_models/all_nt/all_nt.fasta"
        )  # expandvars needed: Python does not expand "$HOME" in a plain string
        name_out = "./back_translated/%s.DNA_backtranslated.fasta" % (
            name[:-5])
        nuc_align_file = name_out
        table = 1
        try:
            table = int(table)
        except ValueError:
            stop_err("Bad table argument %r" % table)

        prot_align = AlignIO.read(prot_align_file,
                                  align_format,
                                  alphabet=generic_protein)
        nuc_dict = SeqIO.index(nuc_fasta_file, "fasta")
        nuc_align = alignment_back_translate(prot_align,
                                             nuc_dict,
                                             gap="-",
                                             table=table)
        AlignIO.write(nuc_align, nuc_align_file, align_format)

##if len(sys.argv) == 4:
##    align_format, prot_align_file, nuc_fasta_file = sys.argv[1:]
##    nuc_align_file = sys.stdout
##    table = 0
##elif len(sys.argv) == 5:
##    align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
##    table = 0
##elif len(sys.argv) == 6:
##    align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
Exemplo n.º 45
0
def downsample(in_metadata, out_metadata, in_fasta, out_fasta, max_diff,
               outgroup_file, downsample_date_excluded, downsample_included,
               downsample_lineage_size):
    original_num_seqs = 0
    sample_dict = {}
    var_dict = {}

    count_dict, num_samples = get_count_dict(in_metadata)
    most_frequent = get_by_frequency(count_dict, num_samples, band=[0.05, 1.0])
    very_most_frequent = get_by_frequency(count_dict,
                                          num_samples,
                                          band=[0.5, 1.0])

    lineage_dict = get_lineage_dict(in_metadata, downsample_lineage_size)

    outgroups = parse_outgroups(outgroup_file)
    indexed_fasta = SeqIO.index(in_fasta, "fasta")

    with open(in_metadata, 'r', newline = '') as csv_in, \
        open(out_fasta, 'w', newline = '') as fa_out, \
        open(out_metadata, 'w', newline = '') as csv_out:

        reader = csv.DictReader(csv_in,
                                delimiter=",",
                                quotechar='\"',
                                dialect="unix")
        writer = csv.DictWriter(csv_out,
                                fieldnames=reader.fieldnames,
                                delimiter=",",
                                quotechar='\"',
                                quoting=csv.QUOTE_MINIMAL,
                                dialect="unix")
        writer.writeheader()

        for row in reader:
            fasta_header = row["sequence_name"]
            if fasta_header not in indexed_fasta:
                continue
            if original_num_seqs % 1000 == 0:
                now = datetime.datetime.now()
                print("%s Downsampled from %i seqs to %i seqs" %
                      (str(now), original_num_seqs, len(sample_dict)))
            original_num_seqs += 1

            if fasta_header in outgroups or not should_downsample_row(
                    row, downsample_date_excluded, downsample_included,
                    downsample_lineage_size, lineage_dict):
                if fasta_header in outgroups:
                    row["why_excluded"] = ""
                writer.writerow(row)
                if row["why_excluded"] in [None, "None", ""
                                           ] and fasta_header in indexed_fasta:
                    seq_rec = indexed_fasta[fasta_header]
                    fa_out.write(">" + seq_rec.id + "\n")
                    fa_out.write(str(seq_rec.seq) + "\n")
                else:
                    print(row["why_excluded"], fasta_header,
                          (fasta_header in indexed_fasta))
                continue

            muts = row["nucleotide_variants"].split("|")
            if len(muts) < max_diff:
                #if not row["why_excluded"]:
                #    row["why_excluded"] = "downsampled with diff threshold %i" %max_diff
                writer.writerow(row)
                continue

            found_close_seq = False

            samples = set()
            low_frequency_muts = [
                mut for mut in muts if mut not in most_frequent
            ]
            if len(low_frequency_muts) == 0:
                low_frequency_muts = [
                    mut for mut in muts if mut not in very_most_frequent
                ]
                if len(low_frequency_muts) == 0:
                    low_frequency_muts = muts
            if len(low_frequency_muts) > max_diff + 1:
                low_frequency_muts = low_frequency_muts[:max_diff + 1]
            for mut in low_frequency_muts:
                if mut in var_dict:
                    samples.update(var_dict[mut])
            if downsample_lineage_size:
                samples = list(samples & set(lineage_dict[row["lineage"]]))

            for sample in samples:
                if num_unique(muts, sample_dict[sample]) <= max_diff:
                    found_close_seq = True
                    #if not row["why_excluded"]:
                    #    row["why_excluded"] = "downsampled with diff threshold %i" %max_diff
                    writer.writerow(row)
                    break
            if not found_close_seq:
                sample_dict[fasta_header] = muts
                for mut in muts:
                    if mut not in var_dict:
                        var_dict[mut] = [fasta_header]
                    else:
                        var_dict[mut].append(fasta_header)
                row["why_excluded"] = ""
                writer.writerow(row)
                if fasta_header in indexed_fasta:
                    seq_rec = indexed_fasta[fasta_header]
                    fa_out.write(">" + seq_rec.id + "\n")
                    fa_out.write(str(seq_rec.seq) + "\n")

    now = datetime.datetime.now()
    print("%s Downsampled from %i seqs to %i seqs" %
          (str(now), original_num_seqs, len(sample_dict)))
    # return sample_dict.keys()


# def main():
#     args = parse_args()
#     subsample = downsample(args.in_metadata, args.out_metadata, args.in_fasta, args.out_fasta, args.diff, args.outgroups, args.downsample_date_excluded, args.downsample_included, args.downsample_lineage_size)

# if __name__ == '__main__':
#     main()
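num_unique is the author's helper and is not defined in this excerpt; one plausible reading, given how it gates found_close_seq above, is a symmetric-difference count between two mutation lists:

def num_unique(muts_a, muts_b):
    # mutations present in exactly one of the two samples
    return len(set(muts_a) ^ set(muts_b))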
Exemplo n.º 46
0
def write_hash_dict(in_fasta,
                    out_fasta,
                    out_metadata,
                    outgroup_file,
                    in_metadata=None):
    outgroups = parse_outgroups(outgroup_file)
    print(outgroups)
    records = SeqIO.index(in_fasta, "fasta")
    hash_dict = {}

    for record_id in records:
        seq = str(records[record_id].seq)
        if seq in hash_dict:
            hash_dict[seq] = hash_dict[seq] + [record_id]
        else:
            hash_dict[seq] = [record_id]
    print("Found %i unique fasta sequences" % len(hash_dict))

    date_dict = {}
    if in_metadata is not None:
        with open(in_metadata, 'r', newline='') as csv_in:
            reader = csv.DictReader(csv_in,
                                    delimiter=",",
                                    quotechar='\"',
                                    dialect="unix")
            for row in reader:
                if row["epi_week"] not in [None, "None", ""]:
                    date_dict[row["sequence_name"]] = int(row["epi_week"])
        print("Found", len(date_dict), "epi_week dates")

    with open(out_fasta, "w") as fasta, open(out_metadata, "w") as metadata:
        metadata.write("tip,redundant\n")

        for key, value in hash_dict.items():
            if len(value) == 1:
                r = records[value[0]]
                fasta.write(">" + r.id + "\n")
                fasta.write(str(r.seq) + "\n")

            elif len(value) > 1:
                r = None
                for id in list(value):  # iterate over a copy: value is mutated below
                    if id in outgroups:
                        r = records[id]
                        fasta.write(">" + r.id + "\n")
                        fasta.write(str(r.seq) + "\n")
                        value.remove(id)

                if not r:
                    index = 0
                    date = 0
                    if in_metadata is not None:
                        for i, id in enumerate(value):
                            if id in date_dict and date_dict[id] > date:
                                index = i
                                date = date_dict[id]
                    r = records[value[index]]
                    fasta.write(">" + r.id + "\n")
                    fasta.write(str(r.seq) + "\n")
                    value.remove(value[index])

                metadata.write(r.id + ",")
                metadata.write("|".join(value) + "\n")
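A hypothetical invocation of the deduplication step above (all file names invented; the metadata CSV only needs sequence_name and epi_week columns):

write_hash_dict("all_seqs.fasta", "dedup.fasta", "redundant_tips.csv",
                "outgroups.txt", in_metadata="metadata.csv")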
Exemplo n.º 47
0
            "WARNING! BigWig file does not allow overlapped items. A wiggle file was generated instead.\n"
        )
        new_output_format = "wiggle"

    if output_format != "wiggle" and output_file is None:
        sys.stderr.write(
            "WARNING! An output filename is needed to save output as {}. "
            "The result is shown below:\n".format(output_format))
        error.append(
            "WARNING! An output filename is needed to save output as {}. "
            "The result is shown above.\n".format(output_format))
        new_output_format = "wiggle"

    output_format = new_output_format

    records = SeqIO.index(input_file, "fasta")
    records_num = len(records)
    write_content = generate_write_content()
    if records_num < 1:
        # No sequence in fasta file, corrupted
        sys.stdout.write(
            "WARNING! {} contains no sequence data.\n".format(input_file))
        raise TypeError
    elif records_num == 1 or one_file:
        result = open_results_file()
        # one sequence in fasta file or one output file for all sequences
        for record in SeqIO.parse(input_file, "fasta"):
            write_title()
            generate_result()
        result.close()
    else:
Exemplo n.º 48
0
"""Coronavirus Variant analysis"""
import pkg_resources, os, pandas
from Bio import SeqIO
from Bio.Data.CodonTable import unambiguous_dna_by_id as codon_table
# path to data files internally used by cova
DATAPATH = pkg_resources.resource_filename('cova', 'data')
# feature table
FEATURETABLE = pandas.read_csv(os.path.join(DATAPATH,
                                            'feature_table_modified.txt'),
                               sep='\t',
                               index_col=0)
# proteins' sequence data
PSEQS = SeqIO.index(os.path.join(DATAPATH, 'protein_extended.faa'), 'fasta')
# CDS' sequence data
CDS = SeqIO.index(os.path.join(DATAPATH, 'cds_extended.fna'), 'fasta')
# reference genome sequence data
GENOME = SeqIO.read(handle=os.path.join(DATAPATH, 'genome.fna'),
                    format='fasta')
# reference accession
REF = GENOME.id.split('.')[0]
# codon table
CODONTABLE = codon_table[1]
# proteins affected by ribosomal slippage
RFS = pandas.read_csv(os.path.join(DATAPATH, 'ribosomal_slippage.csv'),
                      index_col=0)

# list of input/output file and directory names used and generated by CoVa
with open(os.path.join(DATAPATH, 'cova_io_files.txt')) as flob:
    IOF = [i.strip('\n') for i in flob]

# data for sequence typing
Exemplo n.º 49
0
    elif miss_cleavage == 2:
        for j in range(0, len(cut_sites) - 3):
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 1]])
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 2]])
            peptides.append(proseq[cut_sites[j]:cut_sites[j + 3]])

        peptides.append(proseq[cut_sites[-3]:cut_sites[-2]])
        peptides.append(proseq[cut_sites[-3]:cut_sites[-1]])
        peptides.append(proseq[cut_sites[-2]:cut_sites[-1]])

    return peptides


handle1 = SeqIO.parse(sys.argv[1], 'fasta')  # All_COSMIC_Genes.fasta
handle2 = SeqIO.index(sys.argv[2], 'fasta')  # Cosmic_mutant_proteins.fasta
output = open(sys.argv[3], 'w')

peptidome = {}

for record in handle1:
    cds_seq = record.seq
    aa_seq = cds_seq.translate(to_stop=True, stop_symbol="")
    peptide_list = TRYPSIN(str(aa_seq), 0)
    for peptide in peptide_list:
        if 6 <= len(peptide) <= 40:
            il_peptide = peptide.replace("I", "L")
            if il_peptide not in peptidome:
                peptidome[il_peptide] = 1

print len(peptidome)
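A toy check of the windowing in TRYPSIN above, for the zero-missed-cleavage case (sequence and cut sites invented):

cut_sites = [0, 3, 7, 12]  # fragment boundaries, including both ends
proseq = "MAKRGGKWWWQQ"
print [proseq[cut_sites[j]:cut_sites[j + 1]] for j in range(len(cut_sites) - 1)]
# ['MAK', 'RGGK', 'WWWQQ']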
Exemplo n.º 50
0
from sys import argv
import re
from Bio import SeqIO

script, filename1, filename2, filename3 = argv

p = open(filename1)  # HMMTOP output file
record_dict = SeqIO.index(filename2, "fasta")  # FASTA file
outfile = open(filename3, "w+")  # Filename for extracted sequences


# keep the entire sequence if it has 3 < TM < 10
def seqextract(file):
    for line in file:
        ids = []
        line = line.strip()
        # print line
        match = re.search(r'>HP:\s+\d+\s+(.*?)\s.*?(?:IN|OUT)\s+(\d+)', line)
        if match and 3 < int(match.group(2)) < 10:
            protein_id = match.group(1)
            print(protein_id + " " + match.group(2))
            current = record_dict[protein_id]
            SeqIO.write(current, outfile, "fasta")
        else:
            pass


def TMseqextract(file):
    for line in file:
        ids = []
        line = line.strip()
Exemplo n.º 51
0
def codons(readfile, hmmerfile, gene):
    """
    """

    dblength = dblengths[gene]
    threshold = thresholds[gene]
    coords = _load_hxb2(gene)

    reads = SeqIO.index(readfile, "fasta")
    hmmer = SearchIO.read(hmmerfile, "hmmer3-text")

    counts = dict(
        (hxb2, {})
        for hxb2 in range(coords.hxb2.iloc[0], coords.hxb2.iloc[-1] + 1, 3))

    for hit in hmmer.hits:

        # Skip hits that contain stop codons (indicates wrong frame)
        if "*" in chain(hsp.aln[1].seq for hsp in hit.hsps): continue

        id, _, frame = hit.id.rpartition("-")
        count = int(id.partition("-")[2])

        if frame.endswith("'"):
            seq = str(reads[id].seq.reverse_complement())
            offset = int(frame[:-1])
        else:
            seq = str(reads[id].seq)
            offset = int(frame)

        for hsp in hit.hsps:

            if math.log(
                (dblength * hsp.hit_span) / 2**hsp.bitscore) >= threshold:
                continue

            i = 0  # tracks position in the alignment (0-indexed)
            hmm = hsp.query_start + 1  # tracks position in the HMM reference sequence (1-indexed)
            read = 3 * hsp.hit_start + offset  # tracks position in the read sequence (0-indexed)

            # read/reference sequences should have same length in alignment
            n = len(hsp.aln[0].seq)
            assert len(hsp.aln[1].seq) == n

            # alignment should not start or end with an insertion
            assert hsp.aln[0].seq[0] != "."
            assert hsp.aln[0].seq[n - 1] != "."

            while i < n:

                aa_frame = []
                codon_frame = []
                hxb2 = coords.loc[hmm, "hxb2"]

                aa = hsp.aln[1].seq[i]
                if aa != "-":
                    aa_frame.append(aa)
                    codon_frame.append(seq[read:read + 3])
                    read += 3

                # Extend frame with insertions relative to HMM
                while i < (n - 1) and hsp.aln[0].seq[i + 1] == ".":
                    aa_frame.append(hsp.aln[1].seq[i + 1])
                    codon_frame.append(seq[read:read + 3])
                    assert aa_frame[-1] != "-"
                    read += 3
                    i += 1

                # Extend frame with deletions relative to HXB2
                for _ in range(coords.loc[hmm, "del"]):

                    if i < (n - 1):
                        aa = hsp.aln[1].seq[i + 1]
                        if aa != "-":
                            aa_frame.append(aa)
                            codon_frame.append(seq[read:read + 3])
                            read += 3
                            i += 1
                            hmm += 1

                    while i < (n - 1) and hsp.aln[0].seq[i + 1] == ".":
                        aa_frame.append(hsp.aln[1].seq[i + 1])
                        codon_frame.append(seq[read:read + 3])
                        assert aa_frame[-1] != "-"
                        read += 3
                        i += 1

                # Assign codons, adjusting coordinates relative to HXB2 insertions
                assert len(aa_frame) == len(codon_frame)
                for _ in range(coords.loc[hmm, "ins"] + 1):
                    # Iterate through the codon frame
                    if codon_frame:
                        codon = codon_frame.pop(0)
                        aa = aa_frame.pop(0)
                        if aa != 'X' and 'N' not in codon:
                            assert str(Seq.translate(codon)) == aa.upper()
                            counts[hxb2][codon] = counts[hxb2].get(codon,
                                                                   0) + count
                    # Deletions occur when the codon frame is empty
                    else:
                        counts[hxb2][""] = counts[hxb2].get("", 0) + count
                    hxb2 += 3

                i += 1
                hmm += 1

    # output
    lines = []
    for hxb2 in sorted(counts):
        for codon in sorted(counts[hxb2], key=counts[hxb2].get, reverse=True):
            lines.append("\t".join(
                [str(hxb2), codon, str(counts[hxb2][codon])]))
    return lines
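The filter at the top of the HSP loop is a log-space E-value test; the same quantity in isolation (a sketch of the expression used above):

import math

def log_evalue(dblength, hit_span, bitscore):
    # E = dblength * hit_span / 2**bitscore; an HSP is kept when log(E) < threshold
    return math.log((dblength * hit_span) / 2 ** bitscore)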
Exemplo n.º 52
0
    if len(list_traits) > 0:
        print('\nGenomic traits being scanned = ' + str(len(list_traits)) + ':\n')
        for t in list_traits:
            print('\t- ' + t)
    else:
        print('No genomic annotation was found. Stopping mutation detection...\n')
        sys.exit()



    """ CODONS """
    # Read original sequence file, and create list of codons
    dfCodons = pd.DataFrame(columns=['genomes'] + list_traits)

    # search for insertion in reference genome
    seq_index = SeqIO.index(alignment, 'fasta')
    ref_seq = seq_index[ref_genome].seq
    insert_pos = [pos for pos, nuc in enumerate(ref_seq, 1) if nuc == '-']

    insertions = {}
    if len(insert_pos) > 0:
        print('\nInsertions detected in the following positions in \"' + ref_genome + '\":\n')
        cur_idx = 0
        cur_ins = 0
        lag = 0
        print(str(insert_pos))

        for i in insert_pos:
            scanned = [site for list_sites in insertions.values() for site in list_sites]
            if i > cur_ins + 1: # not consecutive
                lag = len(scanned)
Exemplo n.º 53
0
from Bio import SeqIO

(file, id, start, end) = ("secondround_merged_expanded.fasta",
                          "C7136661:0-107", 1, 10)

record_dict = SeqIO.index(file, "fasta")
print record_dict[id].seq[start:end]
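One caveat with this pattern: Seq slicing is ordinary Python slicing, 0-based and end-exclusive, so seq[1:10] above returns bases 2-10 of the record in 1-based coordinates. For the first ten bases:

print record_dict[id].seq[0:10]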
Exemplo n.º 54
0
    def writeCDNAresults(self, target, target_folder, outf, contigf):
        """
        This is ONLY called when a cDNA target is finished.

        When doing a cDNA type run, it is very useful to have both of the following:
        1) All contigs that belong to a gene (isogroup)
            - It would be particularly good to re-orient these if they are in RC.
        2) Total number of reads assembled in each gene (isogroup)

        Additionally it would be excellent to some day also get the following:
        3) Transcript (isotig) structure
        4) Estimate of isotig specific reads.

        """
        if self.params['assembler'] == 'newbler':
            contigf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "assembly", "454AllContigs.fna")
            isotigsf = os.path.join(self.params['working_dir'], target_folder,
                                    "assembly", "assembly",
                                    "454IsotigsLayout.txt")
            readstatusf = os.path.join(self.params['working_dir'],
                                       target_folder, "assembly", "assembly",
                                       "454ReadStatus.txt")
        else:
            logger.info(
                "WARNING writeCDNAresults called when assembler was not Newbler"
            )
            return None
        if not (os.path.exists(contigf) and os.path.exists(isotigsf)
                and os.path.exists(readstatusf)):
            logger.info("CDNA WARNING MISSING FILE!! %s %s" %
                        (target, self.params['sample']))
            logger.info("%s exists: %s", contigf, os.path.exists(contigf))
            logger.info("%s exists: %s", isotigsf, os.path.exists(isotigsf))
            logger.info("%s exists: %s", readstatusf, os.path.exists(readstatusf))
            return None
        #Storage data structures:
        isogroups = {}  # isogroup -> in-order list of contigs
        readcounts = Counter()  # read counts from ReadStatus, keyed by isogroup
        contig_orientation = {}
        contig_to_isogroup = {}
        contig_idx = SeqIO.index(contigf, "fasta")
        # Parse isotigsf:
        igroup = ""
        #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf
        for l in open(isotigsf, 'r'):
            #Handle lines with only a '\n'
            if l == '\n':
                pass
            #Handle lines for isogroup:
            elif l[0:9] == '>isogroup':
                igroup = l.strip().split()[0].strip(">")
            #Handle lines containing all contigs:
            elif l.strip().split()[0] == 'Contig':
                l2 = l.strip().split()
                contigs = ["contig" + x for x in l2[2:-1]]
                isogroups[igroup] = contigs
                for contig in contigs:
                    if contig not in contig_orientation:
                        contig_orientation[contig] = '+'
                        contig_to_isogroup[contig] = igroup
                    else:
                        raise exceptions.FatalError(
                            'Contig %s in %s more than once' %
                            (contig, contigf))
            #Handle lines containing contig orientation info:
            elif l[0:6] == 'isotig':
                l2 = l[l.find(" ") + 1:l.rfind(" ") - 1]
                l3 = [l2[i:i + 6] for i in range(0, len(l2), 6)]
                for i in range(len(l3)):
                    if l3[i][0] == '<':
                        # contig is in reverse orientation
                        contig = isogroups[igroup][i]
                        contig_orientation[contig] = '-'
        #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation)
        #Now parse readstatus:
        inf = open(readstatusf, 'r')
        inf.readline()  # discard first line
        for l in inf:
            l2 = l.strip().split('\t')
            #Determine if this read was assembled
            if len(l2) == 8:
                contig = l2[2]
                # Note that there are some built in limits to the number of contigs that can be in an isogroup:
                # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/
                # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file.
                if contig in contig_to_isogroup:
                    readcounts[contig_to_isogroup[contig]] += 1
                else:
                    readcounts['ExceedsThreshold'] += 1
        #print self.params['sample'], target, "Parse read status"

        #Finally, output all of this information appropriately:
        countsf = open(
            os.path.join(self.params['finished_dir'],
                         "isogroup_read_counts.tsv"), 'a')
        sample = self.params['sample']
        #First write out readcounts: sample \t target \t isogroup \t readcount
        for isogroup in readcounts:
            countsf.write('\t'.join(
                [sample, target, isogroup,
                 str(readcounts[isogroup])]) + '\n')
        countsf.close()
        #print self.params['sample'], target, "Wrote readcounts"

        #Next write the contigs in proper order and orientation:
        ncontigs = 0
        nisogroups = 0
        for isogroup in isogroups:
            nisogroups += 1
            for contig in isogroups[isogroup]:
                ncontigs += 1
                seqrec = contig_idx[contig]
                #print self.params['sample'], target, seqrec
                if contig_orientation[contig] == '-':
                    seqrec.seq = seqrec.seq.reverse_complement()
                #print self.params['sample'], target, seqrec
                seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig
                #print self.params['sample'], target, seqrec
                SeqIO.write(seqrec, outf, "fasta")
        ## TODO: add support for the ExceedsThreshold contigs
        logger.info(
            "Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups "
            % (self.params['sample'], target, self.params['iteration'],
               ncontigs, nisogroups))
Exemplo n.º 55
0
                          proteinID=row[2],
                          PSMcount=int(row[3]),
                          ratio=row[4],
                          error=row[5],
                          cluster=row[6])
        peptide.length = len(peptide.seq)
        if gene not in gene_peparray:
            gene_peparray[gene] = [peptide]
        else:
            gene_peparray[gene].append(peptide)

    print 'there are', len(gene_peparray), 'genes in total'
    input_file.close()
    print 'mapping.....'

    record_dict = SeqIO.index('varseq.fa', 'fasta')

    handle = open('splicingvar.txt')
    gene_variant = {}
    variant_dic = {}
    variant_exon = {}
    for line in handle:
        row = line.strip().split("\t")
        exon = EXON(gene=row[1],
                    chr="chr" + row[2],
                    strand=row[3],
                    variant=row[4],
                    number=row[5],
                    start=int(row[6]),
                    end=int(row[7]),
                    trans_start=int(row[8]),
Exemplo n.º 56
0
        if higher == lower: higher = lower + 1
        #print('higher:'+str(higher)+',lower:'+str(lower));
        if higher < 100:
            # linear interpolation between the two adjacent weights
            val = posweight[lower] * (higher - nfrac) + posweight[higher] * (
                nfrac - lower)
        else:
            val = posweight[99]
        kweight += val
        rlenweight.append(kweight)

bedfile = sys.argv[-2]
reffile = sys.argv[-1]
#ofastafile=sys.argv[-1];

# build reference
seqref = SeqIO.index(reffile, 'fasta')
refkeys = list(seqref.keys())

# read bed file, and ready for writing
if bedfile != "-":
    fid = open(bedfile)
else:
    fid = sys.stdin
#ofid=open(ofastafile,'w');
ofid = sys.stdout

nlines = 0

prevchr = ''
previndex = ''
Exemplo n.º 57
0
    for u, v, d in list(g.edges_iter(data=True)):  # snapshot: edges are removed inside the loop
        if d['weight'] <= args.weight:
            g.remove_edge(u, v)

    for u in g.nodes():
        if g.degree(u) == 0:
            g.remove_node(u)

    print 'Removed edges of {0} weight or less: {1}/{2}'.format(
        args.weight, g.order(), g.size())

if args.enzyme:
    RESTRICTION_BATCH = RestrictionBatch([args.enzyme])
    ENZYME = RestrictionBatch.get(RESTRICTION_BATCH, args.enzyme, add=False)

seqidx = SeqIO.index(args.fasta, 'fasta')
attrib = {}
for si in seqidx:

    if g.has_node(si):
        attr = {'length': len(seqidx[si])}

        if args.enzyme:
            seq_sites = count_sites(seqidx[si])
            seq_sites = seq_sites if seq_sites > 0 else 1
            attr['site'] = seq_sites

with open(args.cover, 'r') as cov_h:
    for l in cov_h:
        if l.startswith('#ID'):
            continue
Exemplo n.º 58
0
def strip_introns(fasta,
                  verb=None,
                  test=False,
                  min_intron_len=35,
                  max_intron_len=10000,
                  multi_species=False,
                  peptide=''):
    # want the chrom (refers to coordinates)
    intron_file = '{}_introns_1.FASTA'.format(fasta[:-6])
    p_head = ''
    if peptide != '':
        peptide_dict = SeqIO.index(peptide, "fasta", key_function=get_pep_id)
        p_head = ' pep'
    headline = '# id chr beg end str n/m len gc ambig?{} seq\n'.format(p_head)
    enough_introns = False

    don_motif = {}
    acc_motif = {}
    dinuc_motif = {}
    dinuc_dist = {}

    with open(fasta) as handle:
        o = open(intron_file, 'w')
        o.write(headline)
        example = 0

        don = {}
        acc = {}
        dinuc = {}
        for seq_record in SeqIO.FastaIO.FastaIterator(handle,
                                                      title2ids=get_exon_id):
            if verb:
                print("Seq Record: " + seq_record.name)
            chrom = re.match('.+chr_name1="([^"]+)"',
                             seq_record.description).group(1)
            if 'scaffold' in chrom:
                if verb:
                    print('Scaffolding skipped!')
                continue

            exon_positions = {}
            pos = ['beg', 'end']
            r = seq_record.name.split('|')
            for i in range(2):
                exon_positions[pos[i]] = [int(x) for x in r[i].split(';')]
            strand = int(
                re.match('.+gene_chrom_strand="([^"]+)"',
                         seq_record.description).group(1))
            species = re.match('.+organism_name="([^"]+)"',
                               seq_record.description).group(1)
            if verb:
                print('strand: ', strand)
            start = int(
                re.match('.+transcript_chrom_start="([^"]+)"',
                         seq_record.description).group(1))

            intron_count = len(exon_positions['beg']) - 1  # Is this right?
            if verb:
                print('Exons:')
                for i in range(0, intron_count + 1):
                    print('{} - b: {} e: {}'.format(i + 1,
                                                    exon_positions['beg'][i],
                                                    exon_positions['end'][i]))
                    # print ('There should be {} introns.'.format(intron_count))

            intron_positions = {'beg': [], 'end': []}
            if verb:
                print('Introns: ')
            for i in range(1,
                           intron_count + 1):  # Strand represented by 1 or -1
                # if strand > 0:
                intron_positions['beg'].append(exon_positions['end'][i - 1] +
                                               1)
                intron_positions['end'].append(exon_positions['beg'][i] - 1)
            # else:
            #     intron_positions['beg'].append(exon_positions['end'][i] + 1)
            #     intron_positions['end'].append(exon_positions['beg'][i-1]-1)
            if verb:
                for i in range(0, intron_count):
                    print('{} - b: {} e: {}'.format(
                        i + 1, intron_positions['beg'][i],
                        intron_positions['end'][i]))

            # return intron_positions # Is this all I want? Won't work with
            #   per transcript loop

            introns = []

            for i in range(0, intron_count):
                # coordinates are stored on the forward strand, so both
                # strands slice identically; minus-strand introns are then
                # reverse-complemented
                intron = seq_record.seq[intron_positions['beg'][i] -
                                        start:intron_positions['end'][i] -
                                        start]
                if strand <= 0:
                    intron = intron.reverse_complement()
                introns.append(intron)
            if verb:
                print('The introns of {} are '.format(seq_record.id))
                for x in introns:
                    print(str(x))
            # Gather further info for output

            strand_sym = '+' if strand > 0 else '-'  # strand already parsed above

            # Output
            s = 1
            if species not in don:
                don[species] = []
                acc[species] = []
                dinuc[species] = []
                dinuc_motif[species] = []
            for x in introns:
                # If intron is not anomalous...
                if not (len(x) > max_intron_len or len(x) < min_intron_len):
                    #  Setting up donor and acceptor tables
                    # upper is good???
                    don[species].append(x.upper()[:don_len])
                    acc[species].append(x.upper()[-acc_len:])
                    dinuc[species].extend(dinucs(x))

                beg = intron_positions['beg'][s - 1]
                end = intron_positions['end'][s - 1]
                l = abs(end - beg)
                intron_set = '{}/{}'.format(s, intron_count)
                order = [
                    seq_record.id, species, chrom,
                    str(beg),
                    str(end), strand_sym, intron_set,
                    str(l)
                ]
                order.extend(analyze_intron(x))
                if peptide != '':
                    pep_id = get_pep_id(seq_record.description)
                    order.append(str(len(peptide_dict[pep_id])))
                order.append(str(x))
                o.write('\t'.join(order) + '\n')
                s += 1
            example += 1
            if example > 4 and test:
                break
    # delete output file if not enough_introns?
    o.close()

    for species in don:
        don_motif[species] = motifs.create(don[species])
        acc_motif[species] = motifs.create(acc[species])
        # dinuc_motif[species] = motifs.create(dinuc[species])
        dinuc_dist[species] = {}
        for di in dinuc[species]:
            try:
                dinuc_dist[species][di] += 1
            except KeyError:
                dinuc_dist[species][di] = 1

    with open(intron_file) as out1:
        intron_file_2 = '{}_introns_2.FASTA'.format(fasta[:-6])
        out2 = open(intron_file_2, 'w')
        if peptide != '':
            headline = '# id chr beg end str n/m len gc' +\
                       ' ambig? pep don acc 2mer seq\n'
        else:
            headline = '# id chr beg end str n/m len gc' + \
                       ' ambig? don acc 2mer seq\n'
        out2.write(headline)
        lines = out1.readlines()
        good_ones = 0
        for line in lines[1:]:
            intron = line.split()[-1]
            if len(intron) > max_intron_len or len(intron) < min_intron_len:
                continue
            species = line.split()[1]
            good_ones += 1
            d = score_site(Seq(intron[:don_len], don_motif[species].alphabet),
                           don_motif[species])
            a = score_site(Seq(intron[-acc_len:], acc_motif[species].alphabet),
                           acc_motif[species])
            di_score = score_dinucleotides(intron, dinuc_dist[species])
            order = ('\t'.join(line.split()[:-1]), d, a, di_score, intron)
            out2.write('\t'.join(order) + '\n')
        out2.close()
        if len(lines) == 0:
            print('Requires Python 3 for additional processing')
        else:
            print('Processed {} good introns out of {}'.format(
                good_ones,
                len(lines) - 1))
Exemplo n.º 59
0
for i in SeqIO.parse(inputfile, fmt):

    if len(i.seq) > 50000:
        seqlist = list(chunkstring(i.seq, 50000))
        for c, t in enumerate(seqlist):
            print i.id, c + 1
            f2.write(">" + i.id + "-part-" + str(c + 1) + "\n" + str(t) + "\n")
    else:
        f2.write(">" + i.id + "\n" + str(i.seq) + "\n")

f2.close()

inputfile2 = open('/OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa' % (r2), 'r')
count = SeqIO.index('/OSM/HOME-MEL/all29c/scripts/tmp2/inp%s.fa' % (r2), fmt)

c1 = len(count)

if threads > c1:
    chunksize = c1
    threads = c1
else:
    chunksize = int(c1 / threads) + 1
seqs = []

print "threads=", threads
print "input file=", c1
print "Chunksize =", chunksize

for i in SeqIO.parse(inputfile2, fmt):
Exemplo n.º 60
0
def pull_records(emp_fasta, parsing_list):
    """
    Function to write sequences to new fasta file based 
    on the blast coordinates retrieved for those sequences.
    """
    print(
        "\n---------------------------------------------------------------------------\n"
    )
    print("Writing sequences with updated coordinates...\n")
    #load the fasta file using the indexing function
    #will create a dictionary with accn as keys
    record_dict = SeqIO.index(emp_fasta, "fasta")
    #create set of accession numbers from empirical fasta
    emp_accs = set(list(record_dict.keys()))
    #initiate empty set to populate with accession for seqs extracted
    extracted_accs = set()
    #initiate output files
    autoname = "{}_extracted.fasta".format(emp_fasta.split('.')[0])
    logname = "Log_File_{}.txt".format(emp_fasta.split('.')[0])
    badname = "Log_BadSeqs_{}.fasta".format(emp_fasta.split('.')[0])

    with open(logname, 'a') as fh_log:
        fh_log.write(
            "Accn\tOriginal_Length\tRetained_Length\tCoordinates_Used\n")
    with open(autoname, 'a') as fh_out:
        #initiate count for records sliced and written
        count = 0
        #iterate over list with accns and coordinates
        for item in parsing_list:
            extracted_accs.add(item[0])
            #test if number of accns processed divisible by 100, if so print number
            #ie an on-screen progress report
            if count != 0 and count % 100 == 0:
                print("\t\tFinished writing {:,} updated records...".format(
                    count))
            #look up the sequence using the accn number as the dictionary key
            fullseq = record_dict[item[0]].seq
            seq_parts = []
            #iterate over parsing list where coordinate lists would start (index 1)
            #if only one list, fine, but if multiple we need to examine them to
            #pull out the appropriate parts of the sequence of interest
            for coord in item[1:]:
                #trim sequence if necessary, convert seq object to string
                #which allows for concatenation later (if multiple intervals present)
                seqslice = str(fullseq[coord[0]:coord[1] + 1])
                seq_parts.append(seqslice)
            #if there are multiple sequence sections, join them here
            #if only one item in list will not throw an error
            newseq = "".join(seq_parts)
            #write information to log file
            with open(logname, 'a') as fh_log:
                fh_log.write("{0}\t{1}\t{2}\t{3}\n".format(
                    item[0], len(fullseq), len(newseq), item[1:]))
            #write to updated fasta file
            fh_out.write(">{}\n{}\n".format(record_dict[item[0]].description,
                                            newseq))
            #add to counter
            count += 1

    #write file of sequences that failed
    badseqs = emp_accs - extracted_accs
    if len(badseqs) >= 1:
        with open(badname, 'a') as fh_out:
            for acc in badseqs:
                fh_out.write(">{}\n{}\n".format(record_dict[acc].description,
                                                (record_dict[acc].seq)))

    print("\nWrote a total of {0:,} sequences to {1}.".format(count, autoname))

    if len(badseqs) >= 1:
        print(
            "{0:,} starting sequence(s) did not pass similarity filtering and are written to {1}.\n\n"
            .format(len(badseqs), badname))

    elif len(badseqs) == 0:
        print("All starting sequences passed similarity filtering!\n\n")