def init():
    """Gather gene records from every annotation source and start the BLAST run.

    Works entirely on module-level inputs (``ygap_corr``, ``REF``,
    ``Ref_anno``, ``Ref_seq``, ``ygap_anno``, ``ygap_seq``, ``devin_gff``,
    ``devin_seq``):

    * when ``REF`` is truthy, ``gene`` records are read from ``Ref_anno``;
      IDs classified "Dubious" are remembered, and every other gene whose ID
      does not start with "Q" gets ``id``/``coord``/``seq`` set;
    * YGAP records of type "PROTEIN" or "" get ``seq`` set;
    * "CDS"/"ORF" records from ``devin_gff`` get ``seqid`` remapped through
      the ``ygap_corr`` table and ``orth``/``coord``/``seq`` set; with ``REF``
      truthy, records listing a dubious gene under "SGD" are skipped.

    The collected lists are passed to ``write_fsa`` and ``run_blast`` is
    called afterwards.

    Returns:
        int: always 0.
    """

    def get_scaf2seqid():
        """Return {scaffold suffix: seqid} parsed from the ``ygap_corr`` file."""
        scaf2seqid = {}
        # The context manager closes the handle even if a row fails to parse.
        with open(ygap_corr) as corr_file:
            for line in corr_file:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                # Unpacking into three names enforces the three-column layout.
                scaf, seqid, _chrname = line.split("\t")
                # Only the text after the last "_" of the scaffold name is the key.
                scaf2seqid[scaf.split("_")[-1]] = seqid
        return scaf2seqid

    def attach_seq(record, sequences, seqid, start, end, with_coord=True):
        """Set ``record.seq`` (and optionally ``record.coord``) by strand.

        The sequence is ``sequences[seqid][start-1:end]``; for the "-" strand
        its ``reverse_complement()`` is stored. Any other strand value only
        prints a warning and leaves the record untouched. ``sequences`` is
        indexed lazily so an unknown strand never triggers a lookup.
        """
        if record.strand == "+":
            if with_coord:
                record.coord = "(%d..%d)" % (start, end)
            record.seq = sequences[seqid][start - 1:end]
        elif record.strand == "-":
            if with_coord:
                record.coord = "complement(%d..%d)" % (start, end)
            record.seq = sequences[seqid][start - 1:end].reverse_complement()
        else:
            print("Check strand:%s" % record.type)

    # generate blast compare file
    scaf2seqid = get_scaf2seqid()

    refs = []
    dubious = set()  # only membership is ever tested, so a set is enough
    if REF:
        for record in gff_parse.gffIterator(Ref_anno):
            if record.type != "gene":
                continue
            if record.attributes["orf_classification"][0] == "Dubious":
                dubious.add(record.attributes["ID"][0])
                continue
            gene_id = record.attributes["ID"][0]
            if gene_id.startswith("Q"):
                continue
            record.id = gene_id
            attach_seq(record, Ref_seq, record.seqid,
                       int(record.start), int(record.end))
            refs.append(record)

    ygap = []
    for record in YGAPIterator(ygap_anno):
        if record.type in ("PROTEIN", ""):
            # YGAP records keep their own coord; only the sequence is attached.
            attach_seq(record, ygap_seq, int(record.seqid),
                       int(record.start), int(record.end), with_coord=False)
            ygap.append(record)

    devin = []
    for record in gff_parse.gffIterator(devin_gff):
        if record.type not in ("CDS", "ORF"):
            continue
        if REF:
            # exclude dubious genes; report the ID that actually matched
            # rather than whichever ID the scan happened to end on.
            dubious_hit = next(
                (sgd_id for sgd_id in record.attributes["SGD"]
                 if sgd_id in dubious),
                None)
            if dubious_hit is not None:
                print("find %s is %s " % (record.attributes["Gene"], dubious_hit))
                continue
        # The numeric id must be taken before seqid is overwritten below.
        seqid = int(record.seqid)
        record.seqid = scaf2seqid[record.seqid]
        start = int(record.start)
        end = int(record.end)
        record.orth = "%s_%s" % (record.score,
                                 "|".join(record.attributes["SGD"]))
        attach_seq(record, devin_seq, seqid, start, end)
        devin.append(record)

    if REF:
        write_fsa(ygap, devin, refs)
    else:
        write_fsa(ygap, devin)
    run_blast()
    return 0
# NOTE(review): the statements up to `yield r` are the tail of a generator whose header lies outside this view; they are kept verbatim.
if line.startswith("#"): continue if not line: return r= record() (r.name,r.strand,r.start,r.end,r.ygob,r.seqid,r.shortname,\ r.coord,r.orth,r.type,r.pillar,r.tag,r.anno) = line.split("\t") r.strand = "+" if r.strand == "1" else "-" yield r
# NOTE(review): this class rebinds the name `list`, so every later `list()`
# call in this module builds the subclass below, not the builtin.
class list(list):
    """A list that also carries a ``name`` attribute (initially "")."""
    def __init__(self):
        # Takes no initial items; the instance always starts empty.
        self.name = ""
# Named collection holding every record yielded for deven_gff.
deven = list()
deven.name = "deven"
for record in gff_parse.gffIterator(deven_gff):
    deven.append(record)
# Named collection holding every record yielded for augustus_gff.
augustus = list()
augustus.name = "augustus"
for record in gff_parse.gffIterator(augustus_gff):
    # Drops the first five characters of the seqid
    # (presumably a fixed prefix -- TODO confirm against the augustus output).
    record.seqid = record.seqid[5:]
    augustus.append(record)
# Named collection holding every record yielded by YGAPIterator for ygap_anno.
ygap = list()
ygap.name = "ygap"
for record in YGAPIterator(ygap_anno):
    ygap.append(record)
# NOTE(review): compare_gff continues beyond this view; its visible head is kept verbatim.
def compare_gff(template,*gffs): temp = [read.seqid+"_"+read.start+"_"+read.end+"_"+read.strand for \
out_put_file = "/Users/bingwang/zen/yeast_anno_pipe/%sONscer.txt"%(sp)

#Prepare: numid2scaf dict
#Each row of correspondances_file is "scaffold<TAB>numid<TAB>alpheid";
#scaffold_list keeps the scaffold names in file order.
numid2scaf = {}
scaffold_list = []
# `with` closes the handle even if a row fails to unpack.
with open(correspondances_file) as f:
    for line in f:
        if not line.strip():
            continue  # a blank line would otherwise break the 3-way unpack
        scaffold,numid,alpheid = line.strip().split("\t")
        numid2scaf[numid] = scaffold
        scaffold_list.append(scaffold)

#Prepare: scer_genes synteny list
#Prepare: scer2record dict
scer_genes = []
scer2record = {}
# The iterator is fully consumed inside the `with`, so the reference file
# is closed as soon as the scan finishes.
with open(ref_file) as ref_handle:
    for record in gff_parse.gffIterator(ref_handle):
        if record.type != "gene":
            continue
        # Genes classified as "Dubious" are left out.
        if record.attributes["orf_classification"][0] == "Dubious":
            continue
        # Genes whose ID starts with "Q" are left out as well.
        if record.attributes["ID"][0].startswith("Q"):
            continue
        record.id = record.attributes["ID"][0]
        scer_genes.append(record.id)
        scer2record[record.id] = record

#Prepare: scer2seub and seub2scer dicts
#Prepare: seub2record dict
scer2seub = {}
seub2scer = {}