def parseCMF(fout):
    try:
        with open(RES_DIR + "output.txt", "r") as CMFResults:
            foundMotifs["CMF"] = []
            foundMotifsSeqs["CMF"] = DD(lambda: DD(list))
            alreadyFound = {}
            readmode = 0  # 0=looking for motif, 1=reading pos, 2=looking for pos
            for line in CMFResults:
                if readmode == 2:
                    readmode = 1
                    continue
                if readmode == 1:
                    if len(line) < 2:
                        readmode = 0
                        continue
                    seqName, mpos = line.split("\t")[:2]
                    seqName = seqName.strip()
                    #pdb.set_trace()
                    foundMotifs["CMF"][-1][1].append(SEQ_LENS[seqName[1:]] - int(mpos))
                    foundMotifsSeqs["CMF"][motif][seqName[1:]].append(int(mpos))
                    fout.write("\t" + str(SEQ_LENS[seqName[1:]] - int(mpos)))
                    readmode = 2
                    continue
                if line[0:7] == "MOTIF:\t" and not line[7:] in alreadyFound:
                    motif = line[7:-1]
                    fout.write("\nCMF\t" + line[7:-1])
                    foundMotifs["CMF"] += [[line[7:-1], []]]
                    #foundMotifsSeqs["CMF"][motif] = DD(list)
                    #alreadyFound[line[7:]] = 0
                elif "Positive Sites:" in line:
                    readmode = 1
    except IOError:
        print "Error opening CMF's results file"

def __init__(self, getIndex=0, viewIndex=-1):
    self.getIndex, self.viewIndex = getIndex, viewIndex  #TODO: Add sanity checks?
    self.key2mon = DD(lambda: DD(set))  # Maps a normal key to keys that monitor it.
    self.monkeys = set()                # List of monitor keys.
    # store and waiters are mutually exclusive, and could be kept in the same place
    self.store = DD(list)
    self.waiters = DD(list)
    self.opCounts = {b'get': 0, b'put': 0, b'view': 0, b'wait': 0}
    self.ac, self.rc = 0, 0

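# Hedged sketch (not part of the original source): DD throughout this file appears to be an
# alias for collections.defaultdict, e.g. "from collections import defaultdict as DD", with
# OD/NT likewise standing for OrderedDict/namedtuple. The toy lines below only illustrate how
# the nested DD(lambda: DD(set)) used for key2mon behaves; the key names are made up.
from collections import defaultdict

key2mon = defaultdict(lambda: defaultdict(set))
key2mon['some_key']['monitor_a'].add('watcher-1')  # inner dict and set are created on demand
print(key2mon['some_key']['monitor_a'])            # {'watcher-1'}
print(key2mon['missing_key'])                      # empty defaultdict(set), no KeyError
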
async def get_fkeys(company, table_name):
    with await fkey_lock:
        if company not in fkeys:
            src_fkeys = DD(list)
            tgt_fkeys = DD(list)
            src_fkey = NT('src_fkey', 'src_col, tgt_tbl, tgt_col, alt_src, alt_tgt, test')
            tgt_fkey = NT('tgt_fkey', 'src_tbl, src_col, tgt_col, is_child, test')
            sql = (
                "SELECT b.table_name, a.col_name, a.fkey "
                f"FROM {company}.db_columns a, {company}.db_tables b "
                "WHERE b.row_id = a.table_id "
                "AND a.deleted_id = 0 "
                "AND a.fkey IS NOT NULL")
            async with db_session.get_connection() as db_mem_conn:
                conn = db_mem_conn.db
                cur = await conn.exec_sql(sql)
                async for src_tbl, src_col, fkey in cur:
                    tgt_tbl, tgt_col, alt_src, alt_tgt, is_child, cursor = loads(fkey)
                    if isinstance(tgt_tbl, str):  # normal case
                        test = None
                        src_fkeys[src_tbl].append(
                            src_fkey(src_col, tgt_tbl, tgt_col, alt_src, alt_tgt, test))
                        tgt_fkeys[tgt_tbl].append(
                            tgt_fkey(src_tbl, src_col, tgt_col, is_child, test))
                    else:
                        col_name, vals_tables = tgt_tbl
                        for val, tgt_tbl in vals_tables:
                            test = (col_name, val)
                            src_fkeys[src_tbl].append(
                                src_fkey(src_col, tgt_tbl, tgt_col, alt_src, alt_tgt, test))
                            tgt_fkeys[tgt_tbl].append(
                                tgt_fkey(src_tbl, src_col, tgt_col, is_child, test))
            fkeys[company] = src_fkeys, tgt_fkeys
    comp_fkeys = fkeys[company]
    src_fkeys = comp_fkeys[0][table_name]  # returns [] if not found
    tgt_fkeys = comp_fkeys[1][table_name]  # returns [] if not found
    return src_fkeys, tgt_fkeys

def parseBioProspector(fout):
    # read sequence mappings
    mappings = {}
    mfname = FILTERED_SEQ.split('/')[-1]
    with open("bp_files/bp_" + mfname + ".mappings", "r") as mapIn:
        for line in mapIn:
            line = line.split(">>")
            mappings[line[0]] = line[1].strip()
    with open(RES_DIR + "bp_output.txt", "r") as bpResults:
        foundMotifs["BioProspector"] = []
        foundMotifsSeqs["BioProspector"] = {}
        for line in bpResults:
            if "Motif #" in line:
                motif = line[line.index("(") + 1:line.index("/")]
                fout.write("\nBioProspector\t" + motif)
                foundMotifs["BioProspector"] += [[motif, []]]
                foundMotifsSeqs["BioProspector"][motif] = DD(list)
            elif ">seq" in line:
                slen = int(line.split()[2])
                mpos = int(line.split()[-1])
                if line.split()[-2] == 'r':
                    mpos -= len(motif)
                foundMotifs["BioProspector"][-1][1].append(slen - mpos)
                foundMotifsSeqs["BioProspector"][motif][mappings[line.split()[0][1:]]].append(mpos)
                fout.write("\t" + str(slen - mpos))

def getSMData(SMDir, targetNode, start, stop):
    #print ("getSMdata %s"%targetNode)
    sm = SearchIndex(SMDir + '/%s_sm.px' % targetNode, 40, compTimestamps)
    smd = IndexedHostData(SMDir, targetNode)
    usr2d = DD(list)
    pos = sm.find('%020d' % start)
    #print (pos)
    for x in range(pos, sm.len):
        offset = int(sm[x][20:])
        ts, nd = smd.readData(offset, stop)
        if nd is None:
            break
        #print("nd=" + repr(nd))
        for usrdata in nd[3:]:  # username, userdata
            usr2d[usrdata[0]].append([ts] + list(usrdata[1:7]))
    lseries, mseries = [], []
    for usrname in sorted(usr2d.keys()):
        l, m = [], []
        for e in usr2d[usrname]:
            ts = e[0] * 1000
            l.append([ts, e[4]])
            m.append([ts, e[6]])
        lseries.append({'data': l, 'name': usrname})
        mseries.append({'data': m, 'name': usrname})
    #[{'name':username, 'data':[[timestamp, value]...]} ...]
    return lseries, mseries

def voteRank(sequences, motifs):
    poll = {}
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
    # perform poll
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or something like that
                            poll[sequence][i - 1] += 1
                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                '195 in a sequence with length 200'
                            pdb.set_trace()
    # add up votes for each motif
    ress = DD(int)
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                for pos in motifs[tool][motif][seq]:
                    for p in xrange(pos, pos + len(motif)):
                        ress[motif] += poll[best(sequences, seq)][p - 1]
    # sort motifs by number of votes
    return sorted(map(lambda a: list(a[::-1]), ress.iteritems()))

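# Hedged example (assumption, not from the original code): voteRank appears to expect
# `sequences` as {sequence_name: sequence_string} and `motifs` as
# {tool: {motif: {sequence_name: [1-based positions]}}}; `best` must map a reported name
# onto a key of `sequences`. The toy data below only illustrates that shape.
toy_sequences = {'seq1': 'ACGTACGTAC'}
toy_motifs = {'ToolA': {'ACGT': {'seq1': [1, 5]}},
              'ToolB': {'ACGT': {'seq1': [5]}}}
# ranked = voteRank(toy_sequences, toy_motifs)
# -> [[total_votes, 'ACGT'], ...] sorted by vote count
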
def parseMEME(fout):
    try:
        with open(RES_DIR + "/meme/meme.txt", "r") as memeResults:
            foundMotifs["MEME"] = []
            foundMotifsSeqs["MEME"] = {}
            readMode = 0  # 0 is looking for next motif, 1 is looking for pos's, 2 is reading pos's
            for line in memeResults:
                if readMode == 1:
                    readMode = 2
                    continue
                if readMode == 2:
                    if "----------------------" in line:
                        readMode = 0
                        continue
                    seqName, sPos = line.split()[0:2]
                    seqName = seqName[0:19].strip()
                    foundMotifs["MEME"][-1][1].append(SEQ_LENS[seqName] - int(sPos))
                    foundMotifsSeqs["MEME"][motif][seqName].append(int(sPos))
                    fout.write("\t" + str(SEQ_LENS[seqName] - int(sPos)))
                if "Multilevel" in line:
                    motif = line.strip().replace("Multilevel", "").replace(" ", "")
                    fout.write("\nMEME\t" + motif)
                    foundMotifs["MEME"] += [[motif, []]]
                    foundMotifsSeqs["MEME"][motif] = DD(list)
                elif "Sequence name" in line and "Start" in line:
                    readMode = 1
    except IOError:
        print "Error opening MEME's results file"

def parseXXmotif(fout):
    try:
        foundMotifs['XXmotif'] = []
        foundMotifsSeqs['XXmotif'] = {}
        reportedMotifs = [None]
        lets = 'ACGT'
        baseFname = RES_DIR + '.'.join(FILTERED_SEQ.split('/')[-1].split('.')[:-1])
        #baseFname = 'XXmotif/results/hsap_core_promoters_all'
        # parse motifs
        #pdb.set_trace()
        with open(baseFname + '.pwm') as XXmotifResults:
            cm = []
            for i, line in enumerate(XXmotifResults):
                if i % 6 == 0 or i % 6 == 5:
                    if len(cm) == 4:
                        motif = ''.join([lets[p.index(max(p))]
                                         for p in zip(cm[0], cm[1], cm[2], cm[3])])
                        foundMotifs['XXmotif'].append([motif, []])
                        foundMotifsSeqs['XXmotif'][motif] = DD(list)
                    cm = []
                else:
                    cm.append(map(float, line.split()))
        seqs = []
        # parse sequence mappings
        with open(baseFname + '_sequence.txt') as XXmotifSequences:
            for i in xrange(4):
                XXmotifSequences.next()
            for line in XXmotifSequences:
                seqs.append(line.split('\t')[-1].strip())
        # parse instance locations
        with open(baseFname + '_Pvals.txt') as XXmotifPoss:
            for i in xrange(4):
                XXmotifPoss.next()
            mnum = 0
            for line in XXmotifPoss:
                if len(line) < 4:
                    continue
                if line[:6] == 'Motif ':
                    mnum = int(line.split()[1][:-1]) - 1
                else:
                    pos = int(line.split('\t')[4])
                    seq = seqs[int(line.split('\t')[3]) - 1]
                    foundMotifs['XXmotif'][mnum][1].append(pos)
                    foundMotifsSeqs['XXmotif'][foundMotifs['XXmotif'][mnum][0]][seq].append(pos)
        for motif, poss in foundMotifs['XXmotif']:
            fout.write('\nXXmotif\t' + motif)
            for pos in poss:
                fout.write('\t' + str(pos))
    except IOError as e:
        print e, e.filename
        print "Error opening XXmotif's result file"

def reset_log():
    # TODO: make it such that if key is not in dict, it's init with incoming content
    # this way, we won't have to know in advance what we want to monitor
    return DD(list)

    # NOTE: everything below is unreachable -- the earlier fixed-key OrderedDict version,
    # kept for reference
    logs = OD()
    for name in ['inner log p(x|z)', 'log p(x|z)', 'log p(x|z) nn',
                 'commit', 'vq', 'kl', 'bpd', 'elbo']:
        logs[name] = []
    return logs

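# Hedged illustration (assumption): with the DD(list) returned above, callers can append to
# any metric name without declaring it first, which is what the TODO asks for. Assumes DD is
# collections.defaultdict at module level.
logs = reset_log()
logs['elbo'].append(-1.23)       # key created on first use
logs['new_metric'].append(0.5)   # works without registering the key in advance
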
def parseDECOD(fout):
    try:
        with open(RES_DIR + "decod_found_motifs.txt", "r") as _decodResults:
            foundMotifs["DECOD"] = []
            foundMotifsSeqs["DECOD"] = {}
            alreadyFound = {}
            decodResults = _decodResults.readlines()
            lineNum = 0
            chars = ["A", "C", "G", "T"]
            for motif_num in xrange(int(config["DECOD"]["-nmotif"])):
                PWM = {}
                # parse PWM
                for line in decodResults[lineNum:]:
                    lineNum += 1
                    if len(line) > 1 and line[0] in chars:
                        PWM[line[0]] = line.strip("ACGT []\n").split()
                        if line[0] == "T":
                            break
                # construct motif from PWM
                motif = ""
                for i in xrange(len(PWM['A'])):
                    col = [PWM[x][i] for x in chars]
                    motif += chars[col.index(max(col))]
                foundMotifsSeqs["DECOD"][motif] = DD(list)
                # parse instances
                if motif not in alreadyFound:
                    fout.write("\nDECOD\t" + motif)
                    # find beginning of instances
                    while decodResults[lineNum][0] != ">":
                        lineNum += 1
                    # save instances
                    positions = []
                    while decodResults[lineNum][0] == ">":
                        line = decodResults[lineNum]
                        lineNum += 1
                        if "|revcom" in line:
                            continue
                        seqName = line[1:line.find("\t")]
                        seqName = seqName.strip()
                        pos = int(line.split("\t")[1])
                        foundMotifsSeqs["DECOD"][motif][seqName].append(pos)
                        pos = SEQ_LENS[seqName] - pos
                        positions.append(pos)
                        fout.write("\t" + str(pos))
                    foundMotifs["DECOD"] += [[motif, positions]]
                    alreadyFound[motif] = 0
    except IOError:
        print "Error opening DECOD's results file"

def Find_Matches(pattern, Suffa, Words):
    PS = pattern.split()
    #length = len(PS)
    length = len(pattern)
    index = []
    Returner = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0, length):
        for j in range(length, i, -1):
            remain = length - j + 1
            current = pattern[i:j + 1]
            index = FIND_IN_SUFFIX_ARRAY(Suffa, Words, pattern[i:j + 1])
            Matches = [Match]
            if len(index) > 1:
                for m in index:
                    if m.segment_id != -1:
                        pbfor = i
                        m.leftmin = abs(m.start - pbfor)
                        if m.leftmin == 0 and i > 0:
                            m.leftmin = 1
                        paftr = len(pattern[j + 1:])
                        m.rightmin = abs(m.remain - paftr)
                        #m.rightmin = abs(abs(m.remain) - abs(paftr))
                        if m.rightmin == 0 and remain > 0:
                            m.rightmin = 1
                        min_cost = m.leftmin + m.rightmin
                        m.pstart = i
                        m.pend = j + 1
                        m.leftmax = max(m.start, pbfor)
                        m.rightmax = max(m.remain, paftr)
                        if min_cost <= ceiling_cost + 1:
                            #index.remove(m)
                            Returner[pattern[i:j + 1]].append(m)
                            break
            #if(len(index) > 1):
            #    print(pattern[i:j+1], len(index)-1, file=filep)  #,file = open("test.txt","a"))
            #for v in index:
            #    Returner[pattern[i:j+1]].append(v)
            #break
    return Returner

def Filter_N_Gram(Matches):
    Copy = DD(list)
    Last = -1
    for all in Matches.keys():
        LS = Matches[all]
        LASTIN = LS[0].pstart
        if LASTIN != Last:
            for val in LS:
                Copy[all].append(val)
        Last = LASTIN
    return Copy

def CreateSuffixArrayDD(text):
    Suffix = DD(list)
    Words = text.split(sep=' ')
    Suff_Arr = []
    count = 0
    for i in Words:
        if i != '':
            Suffix[i[0]].append(count)
        count += 1
    SuffA = []
    for i in sorted(Suffix):
        SuffA.extend(SortAlpha(Suffix[i], Words))
    return (SuffA, Words)

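# Hedged illustration (assumption): the first loop above buckets word positions by first
# character before SortAlpha orders each bucket. A standalone version of just that step,
# using collections.defaultdict directly:
from collections import defaultdict

words = "the quick brown fox".split(' ')
buckets = defaultdict(list)
for idx, word in enumerate(words):
    if word != '':
        buckets[word[0]].append(idx)
print(dict(buckets))  # {'t': [0], 'q': [1], 'b': [2], 'f': [3]}
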
def Find_Matches_NO_FILTER(pattern, Suffa, Words):
    PS = pattern.split()
    #length = len(PS)
    length = len(pattern)
    index = []
    Returner = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0, length):
        for j in range(length, i, -1):
            remain = length - j + 1
            current = pattern[i:j + 1]
            index = FIND_IN_SUFFIX_ARRAY_MY_METHOD(Suffa, Words, pattern[i:j + 1])
            Matches = [Match]
            if len(index) > 1:
                for m in index:
                    Returner[pattern[i:j + 1]].append(m)
    return Returner

def conll_evaluate(l0_inputs, alphas, conll_eval_path, all_top_antecedent_scores):
    print("Compiling clusters and evaluators for conll suite")
    coref_predictions = [{} for _ in alphas]
    coref_evaluators = [metrics.CorefEvaluator() for _ in alphas]
    subtoken_maps = {}

    with open(l0_inputs, "rb") as f:
        data_dicts = np.load(f, allow_pickle=True).item().get("data_dicts")

    for example_num, data_dict in enumerate(tqdm(data_dicts)):
        example = data_dict["example"]
        subtoken_maps[example["doc_key"]] = example["subtoken_map"]
        top_span_starts = data_dict["top_span_starts"]
        top_span_ends = data_dict["top_span_ends"]
        top_antecedents = data_dict["top_antecedents"]
        for i in range(len(alphas)):
            top_antecedent_scores = all_top_antecedent_scores[example["doc_key"]][i]
            predicted_antecedents = get_predicted_antecedents(top_antecedents, top_antecedent_scores)
            coref_predictions[i][example["doc_key"]] = evaluate_coref(
                top_span_starts, top_span_ends, predicted_antecedents,
                example["clusters"], coref_evaluators[i])

    summary_dict = DD(list)
    for i in range(len(alphas)):
        print("\n*****************************")
        print("******* alpha = %f *******" % alphas[i])
        summary_dict["alpha"].append(alphas[i])
        conll_results = conll.evaluate_conll(conll_eval_path, coref_predictions[i],
                                             subtoken_maps, official_stdout=True)
        average_f1 = sum(results["f"] for results in conll_results.values()) / len(conll_results)
        summary_dict["Average F1 (conll)"].append(average_f1)
        print("Average F1 (conll): {:.2f}%".format(average_f1))
        p, r, f = coref_evaluators[i].get_prf()
        summary_dict["Average F1 (py)"].append(f)
        print("Average F1 (py): {:.2f}% on {} docs".format(f * 100, len(subtoken_maps.keys())))
        summary_dict["Average precision (py)"].append(p)
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["Average recall (py)"].append(r)
        print("Average recall (py): {:.2f}%".format(r * 100))

    return summary_dict

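# Hedged illustration (assumption): summary_dict maps metric names to per-alpha lists, so the
# i-th entry of every list lines up with alphas[i]. A minimal sketch of that layout with
# made-up numbers:
from collections import defaultdict

summary = defaultdict(list)
for alpha, f1 in zip([0.1, 0.5], [72.3, 74.1]):  # illustration only
    summary["alpha"].append(alpha)
    summary["Average F1 (conll)"].append(f1)
print(dict(summary))  # {'alpha': [0.1, 0.5], 'Average F1 (conll)': [72.3, 74.1]}
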
def parseWeeder(fout):
    try:
        with open(RES_DIR + get_filename(FILTERED_SEQ) + ".w2", "r") as _weederResults:
            foundMotifs["Weeder"] = []
            foundMotifsSeqs["Weeder"] = {}
            weederResults = _weederResults.readlines()[6:]
            for line in weederResults:
                if "Matrix" in line:
                    result = line.split()
                    motif = result[2]
                    fout.write("\nweeder\t" + result[2])
                    foundMotifs["Weeder"] += [[result[2], []]]
                    foundMotifsSeqs["Weeder"][motif] = DD(list)
                elif line[0] == ">":
                    seqName, mpos = line.split('\t')[0::4]
                    seqName = seqName.strip()
                    foundMotifs["Weeder"][-1][1].append(SEQ_LENS[seqName[1:]] - int(mpos))
                    foundMotifsSeqs["Weeder"][motif][seqName[1:]].append(int(mpos))
                    fout.write("\t" + str(SEQ_LENS[seqName[1:]] - int(mpos)))
    except IOError:
        print "Error opening Weeder's results file"

def Find_All_Matches(Suffa, Words, pattern):
    length = len(pattern)
    Matches = DD(list)
    ceiling_cost = (0.3 * len(pattern))
    for i in range(0, length):
        for j in range(length, i, -1):
            search = pattern[i:j + 1]
            ind = FIND_IN_SUFFIX_ARRAY(Suffa, Words, search)
            start = i
            end = j + 1
            remain = len(pattern[j + 1:])
            for M in ind:
                if M.segment_id == -1:
                    break
                #min_cost = MatchFiltering(start,end,remain,MS)
                M.leftmin = abs(M.start - start)
                if M.leftmin == 0 and start > 0:
                    M.leftmin = 1
                M.rightmin = abs(M.remain - remain)
                if M.rightmin == 0 and remain > 0:
                    M.rightmin = 1
                min_cost = M.leftmin + M.rightmin
                if min_cost <= ceiling_cost:
                    M.leftmax = max(M.start, start)
                    M.rightmax = max(M.remain, remain)
                    M.pstart = start
                    M.pend = end
                    Matches[(pattern[i:j + 1])].append(M)
    return Matches

def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='''Script to find out what words BERT\'s attention attends to in layers 0, 9, 10, 11.''')
    parser.add_argument('-i', '--src', type=argparse.FileType('r'), metavar='PATH',
                        help='''File containing multiple lines of input text, if not specified, uses some predefined text.''')
    parser.add_argument('-t', '--top', type=int, default=10, metavar='K',
                        help='Find top K words that BERT attends to. Default 10.')
    parser.add_argument('-b', '--batch_size', type=int, default=20, metavar='B',
                        help='Specify batch size=B. Default 20.')
    parser.add_argument('-g', '--gpu', action='store_true', help='Option to use GPU.')
    parser.add_argument('-a', '--all_layers', action='store_true',
                        help='Output the attention of each layer')
    parser.add_argument('-T', '--out_top', type=int, metavar='N',
                        help='Output top N words in final output. If -a is set, default value is 100')
    parser.add_argument('-m', '--mask', action='store_true',
                        help='Mask attended words and compare predictions with original words. Not functional yet.')
    parser.add_argument('-o', '--out', type=argparse.FileType('w'),
                        help='File to write results', required=True)
    args = parser.parse_args()

    top_k = args.top
    batch_size = args.batch_size
    use_gpu = args.gpu
    do_mask = args.mask
    all_layers = args.all_layers
    out_top_k = args.out_top
    print('all_layers', all_layers)

    if args.src is not None:
        src = args.src
    else:
        text = 'burma has put five cities on a security alert after religious unrest involving buddhists and moslems in the northern city of mandalay , an informed source said wednesday.'
        text1 = 'police arrested five anti-nuclear protesters friday after they sought to disrupt loading of a french antarctic research and supply vessel , a spokesman for the protesters said .'
        text2 = 'turkmen president gurbanguly berdymukhammedov will begin a two-day visit to russia , his country \'s main energy partner , on monday for trade talks , the kremlin press office said .'
        text3 = 'israel \'s new government barred yasser arafat from flying to the west bank to meet with former prime minister shimon peres on thursday , a move palestinian officials said violated the israel-plo peace accords .'
        src = [text, text1, text2, text3]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
    model.eval()

    if do_mask:
        mask_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        mask_model.eval()
        mask_token = tokenizer.mask_token
        mask_id = tokenizer.convert_tokens_to_ids([mask_token])[0]
        ignore_tokens = ['[CLS]', '[SEP]', '.', ',', 'monday', 'tuesday', 'wednesday',
                         'thursday', 'friday', 'saturday', 'sunday', 'said', '#', 'here',
                         '<', 'news', '>', '`', 'has', 'have', 'will', 'the', 'a', 'is',
                         'was', 'are', 'on', 'as', 'after', 'this', 'in', 'with', 'to']
        ignore_ids = tokenizer.convert_tokens_to_ids(ignore_tokens)
        if use_gpu:
            mask_model = mask_model.cuda()

    if use_gpu:
        assert torch.cuda.is_available(), 'GPU unavailable!'
        model = model.cuda()
        print('-- Using GPU --')
    device = torch.device('cuda') if use_gpu else torch.device('cpu')

    total_words_attended = 0
    total_start_time = time.time()

    # if batch_size is None:
    #     count = 0
    #     layers = [0, 9, 10, 11]
    #     for line in src:
    #         if count % 100 == 0:
    #             print('Processed', count, 'lines')
    #         count += 1
    #         line = line.strip()
    #         tokens_tensor = str_to_idx_tensor(tokenizer, line)
    #         if use_gpu:
    #             tokens_tensor = tokens_tensor.cuda()
    #         str_tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist()[0])
    #         outputs = model(tokens_tensor)
    #
    #         cum_attn = []
    #
    #         for l in layers:
    #             layer = outputs[2][l]
    #             summed = layer.sum(dim=2).sum(dim=1).view(-1)
    #             summed = (summed / summed.sum(dim=0))
    #             cum_attn.append(summed)
    #
    #         all_attns = torch.stack(cum_attn).sum(dim=0)
    #         normalized_attn = (all_attns / all_attns.sum(dim=0)).tolist()
    #         sorted_by_attn = sorted(list(zip(normalized_attn, str_tokens)), key=lambda p: p[0], reverse=True)
    #
    #         for p in sorted_by_attn[:top_k]:
    #             attended_word_dict[p[1]] += 1
    #
    #         total_words_attended += len(sorted_by_attn[:top_k])
    #     if args.src is not None:
    #         args.src.close()

    # batch implementation
    if all_layers:
        attended_word_tensor = torch.zeros(12, tokenizer.vocab_size, device=device)
    else:
        layers = torch.tensor([0], device=device)
        attended_word_tensor = torch.zeros(tokenizer.vocab_size, device=device)
    batch_iterator = chunked(src, batch_size)
    iter = 0
    for batch in batch_iterator:
        start_time = time.time()
        if iter % 10 == 0:
            print('Processed', iter, 'batches')
        iter += 1
        batch_toks, _, attn_mask, _ = batch_to_idx_tensor(tokenizer, batch)
        if use_gpu:
            batch_toks = batch_toks.cuda()
            attn_mask = attn_mask.cuda()
        with torch.no_grad():
            outputs = model(batch_toks, attention_mask=attn_mask)
        attn = torch.stack(outputs[2])
        # attn has shape [num_layers (12), batch_size, num_heads, max_src_len, max_src_len]
        if all_layers:
            summed = attn.sum(dim=3).sum(dim=2)  # [num_layers, batch_size, max_src_len]
            summed = summed / summed.sum(dim=2, keepdim=True)  # normalize
            summed.transpose_(0, 1)
            # summed has shape [batch_size, num_layers, max_src_len]
        else:
            attn = attn.index_select(0, layers)
            summed = attn.sum(dim=3).sum(dim=2).sum(dim=0).view(attn.shape[1], attn.shape[4])
            summed = summed / summed.sum(dim=1, keepdim=True)  # normalize
            # summed has shape [batch_size, max_src_len]
        _, topk_idxs = summed.topk(top_k, sorted=True)
        # topk_idxs has shape [batch_size, top_k]
        # split by each item in batch
        split_batch_toks = batch_toks.split(1, dim=0)
        split_topk_idxs = topk_idxs.split(1, dim=0)
        for idxs, toks in zip(split_topk_idxs, split_batch_toks):
            attended_toks = toks.squeeze(0)[idxs.squeeze(0)]
            if all_layers:
                # record topk attended tokens for each layer
                for i in range(summed.shape[1]):
                    attended_word_tensor[i, attended_toks[i]] += 1
            else:
                attended_word_tensor[attended_toks] += 1
        total_words_attended += topk_idxs.shape[0] * topk_idxs.shape[1]
    if args.src is not None:
        args.src.close()

    if all_layers:
        f = args.out
        f.write('Total tokens attended: {}\n '.format(total_words_attended))
        for i in range(attended_word_tensor.shape[0]):
            non_zero_idxs = attended_word_tensor[i].nonzero().view(-1)
            counts = attended_word_tensor[i, non_zero_idxs].tolist()
            toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
            attended_word_dict = DD(int)
            attended_word_dict.update(zip(toks, counts))
            top_attn_count = sorted(attended_word_dict.items(), key=lambda p: p[1], reverse=True)
            out_top_k = 300 if not out_top_k else out_top_k
            f.write('\nLAYER {}\n'.format(i))
            for p in top_attn_count[:out_top_k]:
                f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()
        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))
    else:
        non_zero_idxs = attended_word_tensor.nonzero().view(-1)
        counts = attended_word_tensor[non_zero_idxs].tolist()
        toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
        attended_word_dict = DD(int)
        attended_word_dict.update(zip(toks, counts))
        # end if of batch implementation

        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))
        top_attn_count = sorted(attended_word_dict.items(), key=lambda p: p[1], reverse=True)
        f = args.out
        f.write('Total tokens attended: {}\n '.format(total_words_attended))
        if out_top_k is not None:
            top_attn_count = top_attn_count[:out_top_k]
        for p in top_attn_count:
            f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()

def run_auto(self):
    '''
    test direct data feature based transfer accuracy on the new building
    '''
    rf = RFC(n_estimators=100, criterion='entropy')
    rf.fit(self.train_fd, self.train_label)
    pred = rf.predict(self.test_fd)
    print('direct data feature-based transfer acc on tgt_bldg:',
          ACC(pred, self.test_label))
    #plot_confusion_matrix(self.test_label, pred)
    '''
    step1: train base models from bldg1
    '''
    self.get_base_learners()
    '''
    step2: TL with name feature on bldg2
    '''
    label = self.test_label
    class_ = np.unique(self.train_label)
    for b in self.bl:
        print(b.score(self.test_fd, label))
    n_class = 32
    c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
    c.fit(self.test_fn)
    dist = np.sort(c.transform(self.test_fn))
    ex_id = DD(list)  # example id for each C
    for i, j, k in zip(c.labels_, range(len(self.test_fn)), dist):
        ex_id[i].append(int(j))
    # getting neighbors for each ex
    nb_c = DD()  # nb from clustering results
    for exx in ex_id.values():
        exx = np.asarray(exx)
        for e in exx:
            nb_c[e] = exx[exx != e]
    nb_f = [DD(), DD(), DD()]  # nb from classification results
    for b, n in zip(self.bl, nb_f):
        preds = b.predict(self.test_fd)
        ex_ = DD(list)
        for i, j in zip(preds, range(len(self.test_fd))):
            ex_[i].append(int(j))
        for exx in ex_.values():
            exx = np.asarray(exx)
            for e in exx:
                n[e] = exx[exx != e]
    # use base learners' predictions
    acc_ = []
    cov_ = []
    #for delta in np.linspace(0.1, 0.5, 5):
    for delta in np.linspace(self.agreement_threshold, self.agreement_threshold, 1):
        print('running TL with agreement threshold =', delta)
        labeled_id = []
        confidence = []
        output = DD()
        preds = np.array([999 for i in range(len(self.test_fd))])
        for i in range(len(self.test_fn)):
            # get the weight for each bl: by computing sim btw cluster and clf
            w = []
            v_c = set(nb_c[i])
            for n in nb_f:
                v_f = set(n[i])
                cns = len(v_c & v_f) / float(len(v_c | v_f))  # original count based weight
                #print (len(v_c & v_f), len(v_c | v_f))
                inter = v_c & v_f
                union = v_c | v_f
                d_i = 0
                d_u = 0
                for it in inter:
                    d_i += np.linalg.norm(self.test_fn[i] - self.test_fn[it])
                    #print (np.linalg.norm(self.test_fn[i]-self.test_fn[it]))
                    #input('...')
                for u in union:
                    d_u += np.linalg.norm(self.test_fn[i] - self.test_fn[u])
                if len(inter) != 0:
                    sim = 1 - (d_i / d_u) / cns
                    #sim = (d_i/d_u)/cns
                    if i in output:
                        output[i].extend(['%s/%s' % (len(inter), len(union)), 1 - sim])
                    else:
                        output[i] = ['%s/%s' % (len(inter), len(union)), 1 - sim]
                w.append(sim)
            output[i].append(np.mean(w))
            if np.mean(w) >= delta:
                confidence.append(np.mean(w))
                w[:] = [float(j) / sum(w) for j in w]
                pred_pr = np.zeros(len(class_))
                for wi, b in zip(w, self.bl):
                    pr = b.predict_proba(self.test_fd[i].reshape(1, -1))
                    pred_pr = pred_pr + wi * pr
                preds[i] = class_[np.argmax(pred_pr)]
                labeled_id.append(i)
        acc_.append(ACC(preds[preds != 999], label[preds != 999]))
        cov_.append(1.0 * len(preds[preds != 999]) / len(label))
    print('acc =', acc_, ';')
    print('cov =', cov_, ';')
    return preds[preds != 999], labeled_id, confidence

def voteRefine(sequences, motifs):
    #get probabilities
    lets = "ACGT"
    probability = DD(int)
    for seq in sequences:
        for let in sequences[seq]:
            probability[let] += 1
    s = sum(probability.values())
    for let in lets:
        probability[let] = probability[let] / float(s)
    #conductPoll
    poll = {}
    maxV = 0
    maxL = 0
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
        if len(sequences[seq]) > maxL:
            maxL = len(sequences[seq])
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or something like that
                            #poll[sequence][i - 1] += 1
                            if tool == "CMF":
                                poll[sequence][i - 1] += 1
                            if tool == "Weeder":
                                poll[sequence][i - 1] += 1
                            if tool == "MEME":
                                poll[sequence][i - 1] += 1
                            if tool == "DECOD":
                                poll[sequence][i - 1] += 1
                            if tool == "BioProspector":
                                poll[sequence][i - 1] += 1
                            if tool == "XXmotif":
                                poll[sequence][i - 1] += 1
                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                '195 in a sequence with length 200'
                            pdb.set_trace()
                        if poll[sequence][i - 1] > maxV:
                            maxV = poll[sequence][i - 1]
    #inspectPoll
    ress = []
    THRESH = 3.7
    maxInsts = 0
    MLEN = MOTIF_LEN
    for seq in poll:
        for i in xrange(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) >= MLEN * THRESH:
                curr = sequences[seq][i:i + MLEN]
                bestPWM = None
                bestMatching = 0
                for PWM in ress:
                    matching = compMotifPWM(curr, PWM)
                    if matching > bestMatching and matching > MLEN / 2:
                        bestMatching = matching
                        bestPWM = PWM
                if bestPWM == None:
                    bestPWM = [[0, 0, 0, 0] for x in xrange(MLEN)]
                    ress.append(bestPWM)
                for c, col in zip(curr, bestPWM):
                    col[ALPH[c]] += 1
                insts = sum(bestPWM[0])
                if insts > maxInsts:
                    maxInsts = insts
    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            for spos in xrange(0, len(sequences[seq]) - l):
                # .75% thresh
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    for pos in xrange(spos, spos + l):
                        votedRess[cons] += poll[seq][pos]
    return sorted(votedRess.iteritems(), key=lambda a: a[::-1])

def __init__(self):
    self._errors = DD(list)
    self._workbook_fp = None
    self._days_ago = 0

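# Hedged usage sketch (assumption, illustrative names only): an _errors defaultdict(list)
# lets validation code append messages per category without pre-creating keys.
from collections import defaultdict

errors = defaultdict(list)
errors['missing_column'].append('sheet 1: no "date" header')
errors['bad_value'].append('row 7: days_ago must be >= 0')
for kind, msgs in errors.items():
    print(kind, msgs)
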