Exemplo n.º 1
0
    def setUp(self):
        """
        Load the two test genomes and select one region of the cb4 genome.

        After this runs the instance carries ``sp9_genome``, ``cb4_genome``,
        the ``name``/``seq`` pair returned by ``chr_select`` and ``seqdic``,
        a one-entry dict mapping that name to that sequence.
        """
        logger.info("****set up****")

        sp9_records = fasta2dic("./test/sp9pseudo.fa")
        self.sp9_genome = dic2dic(sp9_records)

        cb4_records = fasta2dic("./test/cb4.fa")
        self.cb4_genome = dic2dic(cb4_records)

        # region "X":19424476-19655212 of the cb4 genome
        selected = chr_select(self.cb4_genome, "X", 19424476, 19655212)
        self.name, self.seq = selected

        self.seqdic = {self.name: self.seq}
Exemplo n.º 2
0
def pre_fsa(fastafile, spe_name=None, out=None):
    """
    Prepare the .fsa file for the tbl2asn program.

    The species name should either be passed in, or the fasta file should be
    named after the species so the name can be taken from the file name.

    The .fsa file written here:
        - has a header line carrying the species information
        - ends with ".fsa" (when ``out`` is left to its default)

    :param fastafile: path of the fasta file; the first record returned by
        ``fasta2dic`` is the one written out
    :param spe_name: species name; defaults to the file name of ``fastafile``
        up to its first ".", without the directory part
    :param out: output path; defaults to ``spe_name + ".fsa"``, placed in the
        directory of ``fastafile`` when the name was derived from it
    :return: the path of the written .fsa file
    :raises ValueError: if ``fastafile`` contains no sequence
    """
    import os  # local import: the file's import block is not visible here

    fa_d = fasta2dic(fastafile)
    if not fa_d:
        # fail before the output file is created, not after
        raise ValueError("no sequence found in %s" % fastafile)
    record = next(iter(fa_d.values()))

    out_dir = ""
    if spe_name is None:
        # Split the directory off first: taking everything before the first
        # "." of the whole path gives "" for "./x.fa" or "../x.fa".
        out_dir, base = os.path.split(fastafile)
        spe_name = base.split(".")[0]
    if out is None:
        out = os.path.join(out_dir, spe_name + ".fsa")
    fastaname = spe_name.replace(" ", "_")  # in case the species name contains space

    # give a mtDNA header line for the fsa file, ready for the submission
    header = ">{fastaname} [organism={spe_name}] [chromosome=mt] [moltype=genomic DNA] " \
             "[gcode=5] [Topology=Circular] [Completedness=Complete] " \
             "{spe_name} mitochondrion, complete genome.".format(
        fastaname=fastaname, spe_name=spe_name)

    with open(out, "w") as fw:
        fw.write(header)
        fw.write("\n")
        fw.write(str(record.seq))
        fw.write("\n")

    return out
Exemplo n.º 3
0
def get_tnra_pro(fastafile, mitfi_out):
    """
    Find the start point and strand of the best tRNA-Pro hit (amino acid
    code "P") in a MITFI result table.

    Nothing is written here; the returned start/strand can be used to
    re-order the mtDNA sequence afterwards.

    Each non-blank line of ``mitfi_out`` must hold nine tab separated fields:
    header, start, stop, score, evalue, AC, AA, model, strand.
    A hit is considered when AA is "P" and evalue <= 0.001; the hit with the
    highest score wins.

    :param fastafile: fasta file which has to contain exactly one sequence
    :param mitfi_out: path of the MITFI result table
    :return: ``None`` if the fasta file does not hold exactly one sequence,
        otherwise ``(start, strand)``: for "+" the start column, for "-" the
        stop column of the best hit; ``(None, "+")`` when no hit qualifies
    """
    fasta_d = fasta2dic(fastafile)
    if len(fasta_d) != 1:
        print("Check the fasta file and make it continues one!")
        return None

    ###### get the start point of trna_Pro and the strand
    trna_pro_start = None
    trna_pro_score = 0.0
    trna_pro_strand = "+"
    with open(mitfi_out, "r") as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # a blank (e.g. trailing) line carries no hit
                continue
            (header, start, stop, score, evalue,
             AC, AA, model, strand) = line.split("\t")
            # score/evalue are only converted for "P" rows, so a 9-column
            # title row does not break the float conversion
            if AA == "P" and float(score) > trna_pro_score and float(evalue) <= 0.001:
                if strand == "+":
                    trna_pro_start = int(start)
                elif strand == "-":
                    trna_pro_start = int(stop)
                # keep the score numeric, otherwise the next comparison
                # would be float against str
                trna_pro_score = float(score)
                # remember the strand of this hit, not of the last line read
                trna_pro_strand = strand

    return (trna_pro_start, trna_pro_strand)
Exemplo n.º 4
0
def exonerate_wrapper(query, target, outfile=False, geneticcode=5, score=100, bestn=None):
    """
    Run exonerate for ``query`` against ``target`` and return its output.

    :param query: path of the query fasta file
    :param target: path of the target fasta file
    :param outfile: when true, the output is also written to
        "<query file name up to its first '.'>.exonerate" in the current
        working directory
    :param geneticcode: value for --geneticcode (default 5)
    :param score: value for --score
    :param bestn: value for --bestn; defaults to the number of sequences in
        ``target``, i.e. one region for one query
    :return: whatever ``myexe`` returns for the exonerate command
    todo: using stringIO to hinder the file IO
    """
    if bestn is None:
        bestn=len(fasta2dic(target)) # default, output one region for one query

    exonerate_cmd="exonerate {query} {target} \
                   --geneticcode {geneticcode} \
                   --score {score} \
                   --bestn {bestn} \
                   ".format(
                        query=query, target=target,
                        geneticcode=geneticcode,
                        score=score,
                        bestn=bestn,
                        )
    out=myexe(exonerate_cmd)

    ## trigger to write the outfile to disk
    if outfile:
        outname=query.split("/")[-1].split(".")[0]+".exonerate"
        with open(outname, "w") as fw:
            # write the exonerate output itself, not the name of the file
            fw.write(out)

    return out
Exemplo n.º 5
0
def get_cogfile(fastafile, wkdir=None, out="m20.txt"):
    """
    Cat all the cds/or protein together, and get the cog file used for ete alignment.

    Sequence names are expected to look like "ppac#ND4": the names sharing the
    part after the "#" are written on one line, tab separated, e.g.
    "ppac#ND4\tcele#ND4\nppac#ND5\tcele#ND5\n".

    Note: this changes the working directory of the process to ``wkdir``.

    :param fastafile: the combined fasta file (resolved after the chdir)
    :param wkdir: working directory, defaults to the current one
    :param out: name of the cog file to write
    :return: ``out``
    :raises IndexError: if a sequence name contains no "#"
    """
    if wkdir is None:
        wkdir = os.getcwd()

    os.chdir(wkdir)
    fa_d = fasta2dic(fastafile)

    # group the sequence names by the part after the "#"
    name_d = {}
    for k in fa_d:
        suffix = k.split("#")[1]
        name_d.setdefault(suffix, []).append(k)

    # the with-block makes sure the file is flushed and closed before returning
    with open(out, "w") as fw:
        for names in name_d.values():
            fw.write("\t".join(names))
            fw.write("\n")

    return out
Exemplo n.º 6
0
def scaf_filter(filename, cutoff_length=None, cutoff_coverage=None, len_cutoff=13000):
    """
    Filter scaffolds/contigs by the length and coverage parsed from their names.

    A sequence is kept when its "length" and "coverage" (as returned by
    ``scaf_name_parse``) are both above the cutoffs. Sequences are collected
    in the iteration order of the fasta dict, and collecting stops once the
    kept sequences add up to ``len_cutoff`` bases or more.

    :param filename: fasta file of scaffolds/contigs
    :param cutoff_length: minimal length (exclusive); defaults to a tenth of
        the largest length
    :param cutoff_coverage: minimal coverage (exclusive); defaults to a
        twentieth of the largest coverage
    :param len_cutoff: stop adding sequences once this total length is reached
    :return: dict of the kept sequences, ``{name: seq}``; empty when the
        fasta file holds no sequence
    """
    fa_dict = fasta2dic(filename)

    # parse every name once and reuse the result for cutoffs and filtering
    parsed = [(name, seq, scaf_name_parse(name)) for name, seq in fa_dict.items()]
    if not parsed:
        # nothing to filter, and max() below would fail on an empty list
        return {}

    # calc the cutoff if not given
    if cutoff_length is None:
        cutoff_length = max(attrs["length"] for _, _, attrs in parsed) / 10
    if cutoff_coverage is None:
        cutoff_coverage = max(attrs["coverage"] for _, _, attrs in parsed) / 20

    # do the filter
    fa_dict_f = {}
    total_len = 0
    for name, seq, attrs in parsed:
        if total_len >= len_cutoff:
            # total_len only grows, so nothing later could be added either
            break
        if attrs["length"] > cutoff_length and attrs["coverage"] > cutoff_coverage:
            fa_dict_f[name] = seq
            total_len += len(seq)

    return fa_dict_f
Exemplo n.º 7
0
    def test_bindseq(self):
        """
        Bind the ce10 reference to the last record of ``self.bigg``
        (gap=100, intron=True) and print the bound sequence and the record.
        """
        genome = fasta2dic("/home/zhaolab1/reference/ce10.fa")
        last_record = self.bigg[-1]

        last_record.bind_chroseq(genome, gap=100, intron=True)
        for shown in (last_record.seq_chro, last_record):
            print(shown)
Exemplo n.º 8
0
    def __test_orfs(self):
        """
        Bind the ce10 reference to record 30 of ``self.bigg`` (gap=0,
        intron=False) and print the bound sequence, the result of
        ``find_orfs_with_trans()`` and the record itself.

        can only run with ce10 ref
        """
        genome = fasta2dic("/home/zhaolab1/reference/ce10.fa")
        record = self.bigg[30]
        record.bind_chroseq(genome, gap=0, intron=False)
        print(record.seq_chro)

        orfs = record.find_orfs_with_trans()
        print(orfs)
        print(record)
Exemplo n.º 9
0
def _ssw_ref(read_path, ref_path="adaptor.fasta"):
    '''
    The original method to find the adaptor position, slow and buggy.

    Every read of the fastq file is aligned against every reference sequence
    and each alignment that is found is printed (reference name, score,
    reference begin/end, query begin/end, cigar string).

    The "<read_path without .fq/.fastq>trimmed.fastq" file is created (and
    truncated) but nothing is written to it.

    :param read_path: path of the fastq file with the reads
    :param ref_path: path of the fasta file with the adaptor sequences
    :return: None
    '''
    ref_dict = fasta2dic(ref_path)
    out_path = read_path.replace(".fq", "").replace(".fastq", "") + "trimmed" + ".fastq"

    # both handles are closed by the with-block, also when an alignment fails
    with open(read_path) as fq_handle, open(out_path, "w"):
        for fq in SeqIO.parse(fq_handle, 'fastq'):
            for ref_name, ref_read in ref_dict.items():
                len_cutoff = len(ref_read) - 3
                aligner = Aligner(ref_read, report_cigar=True)
                aln = aligner.align(fq, min_score=len_cutoff * 2 - 10, min_len=len_cutoff)
                if aln is not None:
                    fields = (ref_name, aln.score, aln.ref_begin, aln.ref_end,
                              aln.query_begin, aln.query_end, aln.cigar_string)
                    # space separated, the same line the print statement produced
                    print(" ".join(str(field) for field in fields))
Exemplo n.º 10
0
def re_order(fastafile,newstart,strand="+",outfasta=None):
    """
    Rotate a single-sequence fasta file so that it begins at ``newstart``.

    :param fastafile: fasta file which holds exactly one sequence
    :param newstart: new start point for the fasta sequence, 1 based
    :param strand: "+" or "-"; for "-" the sequence is reverse complemented
        first and ``newstart`` is converted to the new coordinates
    :param outfasta: output file, defaults to
        "<file name up to its first '.'>_ordered.fasta" in the working directory
    :return: the output file name, or None when the fasta file does not hold
        exactly one sequence
    """
    records = fasta2dic(fastafile)
    if len(records) != 1:
        print("Check the fasta file and make it continues one!")
        return None

    # exactly one entry, so the first key is the only one
    chro = next(iter(records))
    seq = records[chro]

    if outfasta is None:
        stem = fastafile.split("/")[-1].split(".")[0]
        outfasta = stem + "_ordered.fasta"

    if strand == "-":
        # coordinates flip together with the sequence
        newstart = len(seq) - newstart + 1
        seq = seq.reverse_complement()
        records = {chro: seq}
        print(newstart)

    cut = newstart - 1
    # the part from the new start to the end goes first, then the old beginning
    tail = chr_select(record_dict=records, chro=chro, start=cut, end=len(seq))[1]
    head = chr_select(record_dict=records, chro=chro, start=0, end=cut)[1]
    rotated = "".join((tail, head))

    with open(outfasta, "w") as fw:
        fw.writelines([">" + chro + "_re" + "\n", rotated, "\n"])
    return outfasta
Exemplo n.º 11
0
def pre_fsa(fastafile, spe_name, out=None):
    """
    Prepare the .fsa file for sqn submission.

    The file gets a mtDNA header line built from ``spe_name`` followed by the
    sequence of the first record returned by ``fasta2dic``.

    :param fastafile: path of the fasta file
    :param spe_name: species name; spaces are replaced by "_" for the
        sequence id in the header
    :param out: output path; defaults to the path of ``fastafile`` with
        everything from the first "." of its file name replaced by ".fsa"
    :return: the path of the written .fsa file
    :raises ValueError: if ``fastafile`` contains no sequence
    """
    import os  # local import: the file's import block is not visible here

    fa_d = fasta2dic(fastafile)
    if not fa_d:
        # fail before the output file is created, not after
        raise ValueError("no sequence found in %s" % fastafile)
    record = next(iter(fa_d.values()))

    if out is None:
        # Cut at the first "." of the file name only: cutting the whole path
        # gives "" for "./x.fa" or "../x.fa".
        folder, base = os.path.split(fastafile)
        out = os.path.join(folder, base.split(".")[0] + ".fsa")
    fastaname = spe_name.replace(" ", "_")

    header = ">{fastaname} [organism={spe_name}] [chromosome=mt] [moltype=genomic DNA] " \
             "[gcode=5] [Topology=Circular] [Completedness=Complete] " \
             "{spe_name} mitochondrion, complete genome.".format(
        fastaname=fastaname, spe_name=spe_name)
    with open(out, "w") as fw:
        fw.write(header)
        fw.write("\n")
        fw.write(str(record.seq))
        fw.write("\n")

    return out
Exemplo n.º 12
0
def exonerate_parser_write(query, exonerate_file, prefix=None):
    """
    Parse the exonerate result and write the protein and the cds fasta files.

    modification: add the position to the name of the cds and pro, use space to add interval

    :param query: path of the query fasta file; the cds sequences are cut
        from it with ``chr_select``
    :param exonerate_file: the exonerate text output itself (it is wrapped in
        StringIO, not opened as a path)
    :param prefix: prefix of the two output files; defaults to the path of
        ``query`` cut at the first "." of its file name
    :return: ``(cds_outname, p_outname)``, i.e.
        "<prefix>_exonerate_cds.fa" and "<prefix>_exonerate_p.fa"
    """
    import os  # local import: the file's import block is not visible here

    ref_dict = fasta2dic(query)

    if prefix is None:
        # Cut at the first "." of the file name only: cutting the whole path
        # gives "" for "./x.fa" or "../x.fa".
        folder, base = os.path.split(query)
        prefix = os.path.join(folder, base.split(".")[0])

    p_outname = prefix + "_exonerate_p.fa"
    cds_outname = prefix + "_exonerate_cds.fa"

    texts = SearchIO.parse(StringIO(exonerate_file), format="exonerate-text")
    # the with-block flushes and closes both files, also when parsing fails
    with open(p_outname, "w") as fw_p, open(cds_outname, "w") as fw_cds:
        # NOTE(review): per the SearchIO object model these levels should be
        # query result -> hit -> hsp; confirm against the Biopython version in use
        for record in texts:
            for hit in record:
                for hsp in hit:
                    name_str = ">" + hsp.fragment.hit_id
                    name_cds, cds_str = chr_select(ref_dict, hsp.fragment.query_id,
                                                   hsp.fragment.query_start,
                                                   hsp.fragment.query_end)
                    p_str = str(hsp.fragment.query.seq)

                    # both files share the header: hit id, two spaces, cds name
                    fw_p.write(name_str + "  " + name_cds + "\n" + p_str + "\n")
                    fw_cds.write(name_str + "  " + name_cds + "\n" + cds_str + "\n")

    return cds_outname, p_outname
Exemplo n.º 13
0
 def setUp(self):
     """
     Load the two test genomes and select one region of the sp9 genome.

     After this runs the instance carries ``sp9_genome``, ``cb4_genome`` and
     the ``name``/``seq`` pair returned by ``chr_select`` for region
     "cniII":700000-740000.
     """
     # An ordinary instance has no __name__ (only its class has), so fall
     # back to the class name instead of raising AttributeError.
     test_name = getattr(self, "__name__", self.__class__.__name__)
     logger.info("****set up %s ****" % test_name)
     self.sp9_genome = dic2dic(fasta2dic("./test/sp9pseudo.fa"))
     self.cb4_genome = dic2dic(fasta2dic("./test/cb4.fa"))
     self.name, self.seq = chr_select(self.sp9_genome, "cniII", 700000,
                                      740000)
Exemplo n.º 14
0
    """
    d={}
    file_open=open(filename)
    lines=file_open.readlines()
    for line in lines:
        # store the name in the format of key:value
        # {"I:130^131":"AAATTTTCCC"}
        insertion=line.split("\t")[0]+":"+line.split("\t")[1]+"^"+line.split("\t")[2]
        sequence=line.split("\t")[3].strip()
        d[insertion]=sequence
    file_open.close()
    return d

# Usage example. TODO: move this into a main function.
# Both files are read when this module is loaded; write_insertion() below
# uses the two module-level names `insertion` and `record_dict`.
insertion=read_insertion("insertion_filled.txt")
record_dict=fasta2dic("cb4_nfilled.fasta")


def write_insertion(outfile="inserted.fasta"):
    f=open(outfile)
    for name in record_dict.keys():
        subinsertion={}
        print name
        for key in insertion.keys():
            if name==key.split(":")[0]:
                subinsertion[key]=insertion[key]
        seq_old=str(record_dict[name].seq)
        seq_new=[]
        for i in range(0,len(seq_old)):
            for key in subinsertion.keys():
                if i==(int(key.split("^")[1])-2):
Exemplo n.º 15
0
                left.append(i)
            # the right edge can be the end of the chr
            try:
                if nucl=="n" and seq[(i+1)]!="n":
                    right.append(i)
            except Exception:
                pass
            if i==len(seq) and nucl=="n":
                right.append(i)
        # check if the left edge and right edge are paired. Normally it should be OK.
        if len(left)==len(right):
            count=0
            length_N=0
            for i in range(0,len(left)):
                N_single=(chr,left[i],right[i])
                N_list.append(N_single)
                # print some stat out
                length_N+=right[i]-left[i]+1
                if right[i]-left[i]>=N_threshold:
                    count+=1
            print "%s has %d gaps (len> %d bp), total gap length is %d (single N included)." %(chr,count,N_threshold,length_N)
        else:
            print "left", len(left), "unequal to", "right", len(right)
    # store the N_list as pickle file:
    with open("./4st_split_segment/N_list.dat", "wb") as fp:
        pickle.dump(N_list, fp)
    return N_list

if __name__ == '__main__':
    # Script entry: load the insertion-filled fasta and pass it to summary_N.
    # The result is kept in `aa` but not used any further here.
    record_dict=fasta2dic("./4st_split_segment/cb4_insertion_filled.fasta")
    aa=summary_N(record_dict)