def main():
    """Write a tab-separated per-motif summary of cmsearch hits to stdout.

    Changes the working directory to ``MOTIF_DIR`` and, for every entry ``d``
    listed there:

    * reads ``d/d.fna.summary.pscore_added`` (comma-separated, header row
      required) and indexes its rows by the basename of their ``motif`` column;
    * reads every ``d/*.cmsearched`` hit file with ``read_tab_cmscanned`` and
      writes one output row for it.

    Output columns: FAMILY, FAMSIZE (``db_summary[family]['TRUE']``), MOTIF,
    RANK (``Rank.index``), PSCORE (``pscore``), TOTAL_RECALL (number of true
    positive hits), TPS and FPS (comma-joined scores of the true / false
    positive hits).  A hit is a false positive when its ID starts with
    ``FAKE_``; every other hit is a true positive.

    Relies on the module-level names ``MOTIF_DIR``, ``dir_info``,
    ``db_summary`` and ``read_tab_cmscanned``.
    """
    os.chdir(MOTIF_DIR)
    fieldnames = ['FAMILY', 'FAMSIZE', 'MOTIF', 'RANK', 'PSCORE',
                  'TOTAL_RECALL', 'TPS', 'FPS']
    # NOTE: comma cannot be the delimiter because the TPS/FPS columns are
    # themselves comma-joined score lists.
    # csv.writer's second positional argument is the dialect, not the header,
    # so the header list is only written as the first row.
    out_csv = csv.writer(sys.stdout, delimiter='\t')
    out_csv.writerow(fieldnames)

    for d in os.listdir(os.path.curdir):
        fam = dir_info[d]['family']

        # Index the .fna.summary.pscore_added rows by motif file name.
        motif_info = {}
        summary_path = os.path.join(d, d + '.fna.summary.pscore_added')
        with open(summary_path) as handle:
            for row in csv.DictReader(handle, delimiter=','):
                motif_info[os.path.basename(row['motif'])] = row

        for hits_path in glob.iglob(d + "/*.cmsearched"):
            sys.stderr.write("reading {0}....\n".format(hits_path))
            tp = []
            fp = []
            hits = read_tab_cmscanned(hits_path, 'score', 0)
            for hit_id, (score, start, end) in hits.items():
                if hit_id.startswith('FAKE_'):
                    # decoy sequence, so this hit is a false positive
                    fp.append(str(score))
                else:
                    tp.append(str(score))

            motif_name = os.path.splitext(os.path.basename(hits_path))[0]
            info = motif_info[motif_name]
            out_csv.writerow([
                fam,
                db_summary[fam]['TRUE'],
                motif_name,
                info['Rank.index'],
                info['pscore'],
                len(tp),
                ",".join(tp),
                ",".join(fp),
            ])
def run_infernal(motif, scan_filename, output_prefix):
    """Build a CM from ``motif``, search ``scan_filename`` and return the hits.

    Runs ``cmbuild`` to write ``<motif>.cmbuilded`` and then ``cmsearch``,
    which writes ``<output_prefix>.hits`` (tab file) and
    ``<output_prefix>.cmscanned``.  NOTE: the cmsearch options
    (``--fil-T-hmm 10 --toponly -T 10``) are still hard-coded.

    The tab file is parsed with ``miscInfernal.read_tab_cmscanned``.  For
    simplicity, hits that are members of the motif itself are then removed:
    member IDs come from the header lines of ``<motif>.fasta``, which
    currently look like ``>AARF01000040.1/2307-2550(1-244)``.  The ID is the
    text between ``>`` and the first ``(``, or the whole header when there is
    no ``(``.

    Both programs are started from argument lists rather than a shell string,
    so paths containing spaces or shell metacharacters are passed through
    unchanged.  A program that cannot be started or exits non-zero only
    produces a warning on stderr, and an unreadable ``<motif>.fasta`` likewise
    only produces a warning (no hits are removed in that case).

    Returns the object returned by ``read_tab_cmscanned`` with the motif
    members deleted from it.
    """
    import miscInfernal
    import subprocess

    motif = str(motif)
    cm_path = motif + ".cmbuilded"
    commands = (
        ["cmbuild", cm_path, motif],
        ["cmsearch", "--fil-T-hmm", "10", "--toponly", "-T", "10",
         "--tabfile", "{0}.hits".format(output_prefix),
         "-o", "{0}.cmscanned".format(output_prefix),
         cm_path, str(scan_filename)],
    )
    for argv in commands:
        try:
            status = subprocess.call(argv)
        except OSError as err:
            # e.g. the executable is not on PATH; keep going as before,
            # but say so instead of failing silently.
            sys.stderr.write(
                "WARNING: could not run {0}: {1}\n".format(argv[0], err))
            continue
        if status != 0:
            sys.stderr.write(
                "WARNING: {0} exited with status {1}\n".format(argv[0], status))

    result = miscInfernal.read_tab_cmscanned(tab_filename=output_prefix + '.hits')

    # Collect the FASTA header lines of the motif members.
    fasta_path = motif + ".fasta"
    try:
        with open(fasta_path) as handle:
            headers = [line.strip() for line in handle if line.startswith('>')]
    except IOError as err:
        sys.stderr.write(
            "WARNING: cannot read {0}: {1}\n".format(fasta_path, err))
        headers = []

    for header in headers:
        # Drop the leading '>' and the trailing '(start-end)' part, if any.
        member_id = header[1:].split('(', 1)[0]
        sys.stderr.write(
            "removing from cmscan hits {0}....\n".format(member_id))
        try:
            del result[member_id]
        except KeyError:
            # this member was simply not among the hits
            pass

    return result
def evaluate_CMscaned(dir, scan_fasta_pickle, total_ranks_pickle, best_e_or_score, cutoff,
                      eval_func=None,
                      suffix='.nocalib_hmmT10.cmsearched_hits'):
    """
    Report, per family, the motif whose CM scan has the best sensitivity
    (ties broken by precision).

    <dir> should be a directory containing files of format
        <motif_filename><suffix>
    which are outputs from Infernal-1.0's cmsearch using the --tabfile option.

    <scan_fasta_pickle> is a pickled mapping of family --> number of members;
    it provides the denominators for sensitivity.  Keys ending in '-'
    (like 'moco-') are merged into the key without the dash.

    <total_ranks_pickle> can either be from rankpl or pscore. Both will be a
    list of cluster_steps4.MotifRankInfo objects; only their .fam and
    .motif_filename attributes are used here, and entries with fam None are
    skipped.

    <best_e_or_score> and <cutoff> are passed unchanged to
    miscInfernal.read_tab_cmscanned.

    <eval_func> is called as eval_func(hit_id, (best, loc_start, loc_end)) and
    must return a string of the form 'fam___id'.  When omitted, the hit ID
    itself is used, i.e. hit IDs are assumed to already look like 'fam___id'.

    Output on stdout: one LaTeX table row per family
        fam & family size & sensitivity & precision & motif file \\\\ \\hline
    followed by two comma-separated lines holding the best values of the hits
    that matched the expected family ('real') and of all other hits
    ('random').  Diagnostics go to stderr.

    A scanned family with no entry (or a zero count) in the family-count
    mapping makes the sensitivity computation fail with KeyError /
    ZeroDivisionError.
    """
    def _hit_id_as_is(hit_id, hit):
        # default evaluator: the hit ID is already 'fam___id'
        return hit_id

    if eval_func is None:
        eval_func = _hit_id_as_is

    # Pickles have to be read in binary mode to load reliably.
    # NOTE(review): assumes `load` is pickle/cPickle's load -- confirm.
    with open(total_ranks_pickle, 'rb') as handle:
        total_ranks = load(handle)
    with open(scan_fasta_pickle, 'rb') as handle:
        total_fam_counts = load(handle)

    # some of the total_fam_counts have keys that are like moco-
    # we'll put them as moco...
    # Iterate over a snapshot of the keys: the mapping is modified in the loop.
    for key in list(total_fam_counts):
        if key is not None and key.endswith('-'):
            base = key[:-1]
            total_fam_counts[base] = (total_fam_counts.get(base, 0)
                                      + total_fam_counts.pop(key))

    best_cmscan_by_family = defaultdict(lambda: {'sens': 0., 'prec': 0., 'motif': None})
    for_scatterplot = {'real': [], 'random': []}

    # go through each MotifRankInfo obj, if family is not None, then we did a CM scan for it
    # refer to cluster_steps_generate_CMscan_cmds.py to see why this is the case
    for obj in total_ranks:
        if obj.fam is None:
            continue

        # right now the motif_filename is stored like:
        #   grid01_02_124.fna.1.motif.h1_2.dup_rmed.html
        # (it has .dup_rmed. when used with rankpl); everything from the last
        # '.html' on is replaced by <suffix>.
        motif_filename = obj.motif_filename
        cut = motif_filename.rfind('.html')
        if cut != -1:
            motif_filename = motif_filename[:cut]
        filename = motif_filename + suffix
        hits_path = os.path.join(dir, filename)

        if not os.path.exists(hits_path):
            # stderr, so the message does not end up inside the table on stdout
            sys.stderr.write("ERROR: file {0} doesn't exist\n".format(filename))
            continue

        sys.stderr.write(
            "file is {0}, family should be {1}\n".format(filename, obj.fam))
        # returns scanned_id --> (best_e_or_score, start, end)
        cm_results = miscInfernal.read_tab_cmscanned(hits_path, best_e_or_score, cutoff)
        if len(cm_results) == 0:
            continue

        fam = obj.fam
        ids_by_family = defaultdict(set)
        for hit_id, (best, loc_start, loc_end) in cm_results.items():
            fam_with_id = eval_func(hit_id, (best, loc_start, loc_end))  # in format fam___id
            # maxsplit=1 so that an id part containing '___' stays intact
            rb_fam, rb_id = fam_with_id.split('___', 1)
            ids_by_family[rb_fam].add(rb_id)
            bucket = 'real' if rb_fam == fam else 'random'
            for_scatterplot[bucket].append(best)

        # number of distinct members of the expected family that were hit
        true_hits = len(ids_by_family[fam]) if fam in ids_by_family else 0

        # calculate sensitivity and precision(PPV)
        sens = true_hits * 1. / total_fam_counts[fam]
        prec = true_hits * 1. / len(cm_results)

        # Looking the family up also registers it (with motif None) even when
        # this scan does not beat the all-zero default.
        current = best_cmscan_by_family[fam]
        if (sens, prec) > (current['sens'], current['prec']):
            best_cmscan_by_family[fam] = {'sens': sens, 'prec': prec, 'motif': filename}

    # print the final results in latex table format
    for fam in sorted(best_cmscan_by_family, key=lambda name: name.lower()):
        best_scan = best_cmscan_by_family[fam]
        # a row ends with '\\' and the rule is '\hline'
        print("{fam} & {count} & {sens:.2f} & {prec:.2f} & {motif}\\\\ \\hline".format(
            fam=fam,
            count=total_fam_counts[fam],
            sens=best_scan['sens'],
            prec=best_scan['prec'],
            motif=best_scan['motif']))

    # print for_scatterplot
    print(",".join(map(str, for_scatterplot['real'])))
    print(",".join(map(str, for_scatterplot['random'])))