def get_contact_timeline(id, type):
    db = init_db()
    contact = db.query(profile).filter(profile.id == id).first()
    if contact is None:
        return False
    temp = defaultdict(dict)
    temp['label'] = type
    temp1 = defaultdict(int)
    if type in ["email", "chat"]:
        for chats in contact.chats:
            if type == "email":
                if chats.type == CHAT_EMAIL:
                    temp1[int(chats.ts * 1000)] += 1
            if type == "chat":
                if chats.type not in [CHAT_EMAIL, CHAT_SOCIALMEDIA, CHAT_TWITTER]:
                    temp1[int(chats.ts * 1000)] += 1
    if type == "feedback":
        for feedbacs in contact.feedbacks:
            temp1[int(feedbacs.ts * 1000)] += 1
    if type == "twitter":
        tweets = db.query(tweet).filter(tweet.mentioner == contact.twitter)
        for obj in tweets:
            temp1[int(obj.ts * 1000)] += 1
    temp['data'] = temp1.items()
    temp['data'].sort()
    print temp
    db.close()
    return temp
def __init__(self, cols=[], rows=[]):
    super(Table, self).__init__()
    self.cols = cols
    self.rows = rows
    self.cells = defaultdict(lambda: defaultdict(Decimal))
    self.col_totals = defaultdict(Decimal)
    self.row_totals = defaultdict(Decimal)
def _get(request, catfn=lambda tx: tx.entity.category):
    title = 'All Transactions'
    account_id = None
    entity_id = None
    category_id = None
    amount_filter = 0.00
    if request.GET:
        account_id = request.GET.get('account_id', None)
        entity_id = request.GET.get('entity_id', None)
        category_id = request.GET.get('category_id', None)
        amount_filter = float(request.GET.get('amount', amount_filter))
    transactions = Transaction.objects.filter(posted__gte=first_of_year).filter(
        Q(amount__gte=amount_filter) | Q(amount__lte=(amount_filter * -1.0)))
    if account_id:
        title = 'Account = {}'.format(Account.objects.get(id=account_id).heading)
        transactions = transactions.filter(account__id=account_id)
    if entity_id:
        title = 'Entity = {}'.format(Entity.objects.get(id=entity_id).heading)
        catfn = lambda tx: tx
        transactions = transactions.filter(entity__id=entity_id)
    if category_id:
        title = 'Category = {}'.format(Category.objects.get(id=category_id).heading)
        catfn = lambda tx: tx.entity
        transactions = transactions.filter(entity__category__id=category_id)
    total_table = Table(cols=months)
    for tx in transactions:
        month = tx.posted.strftime('%b').upper()
        amount = tx.amount
        category = catfn(tx)
        total_table.add(month, category, amount)
    accounts = Account.objects.all()
    balances = Balance.objects.all()
    total = sum([t.amount for t in transactions])
    balance_dates = set([b.as_of_date for b in balances])
    balance_amounts = defaultdict(lambda: defaultdict(Decimal))
    balance_totals = defaultdict(Decimal)
    for balance in balances:
        date = balance.as_of_date
        amount = balance.signed_amount
        account = balance.account
        balance_dates.add(date)
        balance_amounts[date][account] += amount
        balance_totals[date] += amount
    balance_dates = sorted(list(balance_dates))
    return render_to_response('index.html',
                              {'title': title,
                               'transactions': transactions,
                               'accounts': accounts,
                               'balances': balances,
                               'totals': total_table,
                               'balance_dates': balance_dates,
                               'balance_amounts': balance_amounts,
                               'balance_totals': balance_totals,
                               'total': total},
                              RequestContext(request))
def A_priori(infile, support):
    infile = getDataFromFile(infile)  # read the baskets via getDataFromFile
    lineList = list()
    # Items collects every item that appears in the file; using a set removes duplicates.
    Items = set()
    for line in infile:
        # frozenset makes each basket immutable
        lineList.append(frozenset(line))
        for item in line:
            Items.add(frozenset([item]))
    # All_freqSet collects every frequent itemset together with its support count.
    All_freqSet = defaultdict(int)
    # freqk holds the itemset frequencies of the current pass over all baskets.
    freqk = defaultdict(int)
    allset = dict()
    k = 2
    # First pass of the A-Priori algorithm.
    fristpass, allset_temp = FindItemsFirstTime(lineList, Items, support, freqk)
    freqk = fristpass
    # Keep going as long as the current pass produced frequent itemsets.
    while bool(freqk):
        All_freqSet.update(freqk)
        # allset keeps, per pass, the itemsets that passed the minimum-support check.
        allset[k - 1] = allset_temp
        # Next A-Priori pass: build candidate itemsets of size k and count them.
        anotherpass, allset_temp = AprioriPass(lineList, freqk, support, k)
        freqk = anotherpass
        k += 1
    # final_freq lists each frequent itemset followed by its support count.
    final_freq = []
    for key, value in All_freqSet.items():
        final_freq.extend([tuple(key), All_freqSet[key]])
    return final_freq, allset
def __init__(self, problem):
    self.problem = problem
    self.s = None
    self.a = None
    self.untried = defaultdict(list)
    self.unbacktracked = defaultdict(list)
    self.result = {}
def learning_dictionary(self, data):
    dict_count_first_word = defaultdict(int)
    dict_count_each_word = defaultdict(int)
    dict_count_each_part_of_speech = defaultdict(int)
    dict_count_part_of_speech_CP = defaultdict(int)
    dict_count_word_part_of_speech = defaultdict(int)
    total_number_of_words = 0
    for i in range(0, len(data)):
        dict_count_first_word[data[i][0][0]] += 1
        for j in range(0, len(data[i][0])):
            total_number_of_words += 1
            dict_count_each_word[data[i][0][j]] += 1
            dict_count_each_part_of_speech[data[i][1][j]] += 1
            if j < len(data[i][0]) - 1:
                CP_part_of_speech = data[i][1][j] + "-" + data[i][1][j + 1]
                dict_count_part_of_speech_CP[CP_part_of_speech] += 1
            Word_part_of_speech = data[i][0][j] + "-" + data[i][1][j]
            dict_count_word_part_of_speech[Word_part_of_speech] += 1
    return (dict_count_first_word, dict_count_each_word,
            dict_count_each_part_of_speech, dict_count_part_of_speech_CP,
            dict_count_word_part_of_speech, total_number_of_words)
def load_big_from_figer_out(figerfile, numtype, myt2i):
    logger.info('loading figer results in a big matrix from %s', figerfile)
    f = open(figerfile)
    c = 0
    big = defaultdict(lambda: defaultdict(list))
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) < 3:
            continue
        if not parts[1].startswith('B-'):
            continue
        emid = parts[2].strip()
        scores = [0.0 for i in range(numtype)]
        for i in range(3, len(parts)):
            (mytype, score) = parse_one_type(parts[i])
            if mytype not in myt2i:
                continue
            scores[myt2i[mytype]] = score
        assert len(scores) == numtype
        scores = mynormalize(scores)
        for j in range(numtype):
            big[emid][j].append(scores[j])
        c += 1
        if c == upto:
            break
    logger.info('entity number: %d', c)
    logger.info('big has %d entities', len(big))
    return big
def PRiter(iterations=20):
    iter_time = time.time()
    nodes = NodeBuilder()
    iter_nodelist = defaultdict(lambda: 0.15)
    cur_nodelist = defaultdict(lambda: 0.85)
    print "iterations started", time.time() - iter_time
    for iter in xrange(iterations):
        iter_time = time.time()
        for data in files:
            checker(*data, dd=nodes, cur_nodelist=cur_nodelist, iter_nodelist=iter_nodelist)
        cur_nodelist, iter_nodelist = iter_nodelist, cur_nodelist
        iter_nodelist = defaultdict(lambda: 0.15)
        print time.time() - iter_time, "for iteration", iter
    sorted_nodes_top1000 = sorted(cur_nodelist.items(), key=itemgetter(1), reverse=True)[:1000]
    for name in sorted_nodes_top1000:
        if name[0][0] == "c":
            print name
def find_vm_addr(trace):
    """
    Find the virtual machine addr
    :param trace: instruction trace
    :return: virtual function start addr
    """
    push_dict = defaultdict(lambda: 0)
    vm_func_dict = defaultdict(lambda: 0)
    # try to find the vm segment via a series of push commands, which also identify the vm_addr
    for line in trace:
        try:
            if line.disasm[0] == 'push':
                push_dict[GetFunctionAttr(line.addr, FUNCATTR_START)] += 1
        except:
            pass
    vm_func = max(push_dict, key=push_dict.get)
    vm_seg_start = SegStart(vm_func)
    vm_seg_end = SegEnd(vm_func)
    # test whether vm_func is the biggest function in the segment
    vm_funcs = Functions(vm_seg_start, vm_seg_end)
    for f in vm_funcs:
        vm_func_dict[f] = GetFunctionAttr(f, FUNCATTR_END) - GetFunctionAttr(f, FUNCATTR_START)
    if max(vm_func_dict, key=vm_func_dict.get) != vm_func:
        return AskAddr(vm_func,
                       "Found two possible addresses for the VM function start address: %s and %s. Choose one!"
                       % (vm_func, max(vm_func_dict, key=vm_func_dict.get)))
    else:
        return vm_func
def __init__(self, file_url):
    self.file_url = file_url
    self.file_url_out = file_url + ".out"
    self.dist = None
    self.users = None
    self.feeds = None
    self.movements = defaultdict(list)  # LocationRecord
    self.moveint = defaultdict(list)  # LocationRecord
def load_big_matrix(big_file):
    type2entprobs = defaultdict(lambda: defaultdict(list))
    with open(big_file) as fp:
        for line in fp:
            parts = line.split()
            for i, p in enumerate(parts[1:]):
                type2entprobs[i][parts[0]].append(float(p))
    logger.info('loading the big matrix %s finished', big_file)
    return type2entprobs
def calc_contacts_per_cluster_per_motif(contacts_per_cluster, protein_motifs, ordered_motifs, weight):
    contacts_per_motif = defaultdict(lambda: defaultdict(int))
    for motif_name in ordered_motifs:
        if motif_name in protein_motifs:
            for cluster_id in contacts_per_cluster:
                for res_id in contacts_per_cluster[cluster_id]:
                    res = int(res_id.split(":")[0])
                    if res >= protein_motifs[motif_name][0] and res < protein_motifs[motif_name][1]:
                        contacts_per_motif[cluster_id][motif_name] += 1 / weight
    return contacts_per_motif
def test_keyerror_without_factory(self):
    from _collections import defaultdict
    for d1 in [defaultdict(), defaultdict(None)]:
        for key in ['foo', (1,)]:
            try:
                d1[key]
            except KeyError, err:
                assert err.args[0] == key
            else:
                assert 0, "expected KeyError"
def read_fa(data) -> dict:
    DATA = data.read().split()
    myDict = defaultdict(defaultdict)
    for e in DATA:
        D = e.split(';')
        tempDict = defaultdict(str)
        for j in range(1, len(D), 2):
            tempDict[D[j]] = D[j + 1]
        myDict[D[0]] = tempDict
    return myDict
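# A minimal usage sketch for read_fa above (an assumption about the record format:
# whitespace-separated "state;symbol;target;symbol;target;..." strings, which is what
# the split logic implies). io.StringIO stands in for a real file object.
import io

fa = read_fa(io.StringIO("q0;a;q1;b;q2 q1;a;q0"))
assert fa["q0"]["a"] == "q1" and fa["q0"]["b"] == "q2" and fa["q1"]["a"] == "q0"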
def probability_dictionary(self, data):
    different_parts_of_speech = ['ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NOUN',
                                 'NUM', 'PRON', 'PRT', 'VERB', 'X', '.']
    part_of_speech_probability = defaultdict(int)
    sum_of_parts_of_speech = sum(self.dict_count_part_of_speech_CP.values())
    for i in range(0, len(different_parts_of_speech)):
        for j in range(0, len(different_parts_of_speech)):
            # if(different_parts_of_speech[i]!=different_parts_of_speech[j]):
            # self.dict_count_part_of_speech_CP, e.g. (dict["noun-verb"] = 5)
            temp_part_of_speech = ("cp_" + different_parts_of_speech[i].lower()
                                   + "|" + different_parts_of_speech[j].lower())
            a_int_b = self.dict_count_part_of_speech_CP[
                different_parts_of_speech[j].lower() + "-" + different_parts_of_speech[i].lower()]
            prob_a_int_b = a_int_b / sum_of_parts_of_speech
            # e.g. p("verb") = 0.5
            prob_part_of_speech = float(
                self.dict_count_each_part_of_speech[different_parts_of_speech[j].lower()]
            ) / float(self.total_number_of_words)
            self.dict_prob_each_part_of_speech[different_parts_of_speech[j].lower()] = prob_part_of_speech
            if prob_part_of_speech != 0:
                part_of_speech_probability[temp_part_of_speech] = prob_a_int_b / prob_part_of_speech
            else:
                part_of_speech_probability[temp_part_of_speech] = 0.0005
    # self.dict_count_word_part_of_speech, e.g. (dict["hari-noun"] = 6)
    # self.dict_count_each_word, e.g. (dict["hari"] = 6)
    word_probability = defaultdict(int)
    for i in range(0, len(data)):
        for j in range(0, len(data[i][0])):
            word = data[i][0][j]
            for k in range(0, len(different_parts_of_speech)):
                count_word = self.dict_count_each_word[word]
                count_part_of_speech = self.dict_count_each_part_of_speech[different_parts_of_speech[k]]
                count_word_int_part_of_speech = self.dict_count_word_part_of_speech[
                    word + "-" + different_parts_of_speech[k].lower()]
                probability_of_part_of_speech = self.dict_prob_each_part_of_speech[
                    different_parts_of_speech[k].lower()]
                temp_word = "cp_" + word + "|" + different_parts_of_speech[k].lower()
                if probability_of_part_of_speech != 0 and count_part_of_speech != 0:
                    word_probability[temp_word] = (float(count_word_int_part_of_speech)
                                                   / float(count_part_of_speech)) / probability_of_part_of_speech
                else:
                    word_probability[temp_word] = 0.0005
    # print part_of_speech_probability
    # print word_probability
    return part_of_speech_probability, word_probability
def loadViewSelectionPlan(cls, filename):
    inputfile = open(filename, "r")
    feedMap = {}
    userMap = {}
    viewMap = {}
    feedviewMap = defaultdict(set)
    queryPlan = defaultdict(set)
    lcount = 0
    totFeed = 0
    totUser = 0
    totView = 0
    totFvm = 0
    totQp = 0
    for line in inputfile:
        if lcount == 0:
            elem = line.strip().split("\t")
            totFeed = int(elem[0])
            totUser = int(elem[1])
            totView = int(elem[2])
            totFvm = int(elem[3])
            totQp = int(elem[4])
        elif lcount <= totFeed:
            f = FeedInfo.loadFromLine(line)
            feedMap[f.feed_id] = f
        elif lcount <= totFeed + totUser:
            u = UserInfo.loadFromLine(line)
            userMap[u.user_id] = u
        elif lcount <= totFeed + totUser + totView:
            v = View.loadFromLine(line, feedMap, userMap)
            viewMap[v.view_id] = v
        elif lcount <= totFeed + totUser + totView + totFvm:
            elem = line.strip().split("\t")
            fid = elem[0]
            viewset = set()
            for i in range(1, len(elem)):
                viewset.add(viewMap[elem[i]])
            feedviewMap[fid] = viewset
        elif lcount <= totFeed + totUser + totView + totFvm + totQp:
            elem = line.strip().split("\t")
            uid = elem[0]
            viewset = set()
            for i in range(1, len(elem)):
                viewset.add(viewMap[elem[i]])
            queryPlan[userMap[uid]] = viewset
        else:
            print("Error Line: " + line)
        lcount += 1
    inputfile.close()
    Plan = collections.namedtuple('Plan', ['ViewMap', 'FeedMap', 'UserMap', 'FeedViewMap', 'QueryPlan'])
    P = Plan(ViewMap=viewMap, FeedMap=feedMap, UserMap=userMap, FeedViewMap=feedviewMap, QueryPlan=queryPlan)
    return P
def features(docList):
    import time
    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    # The pickle files are opened in binary mode, which pickle.dump requires on Python 3.
    with open("output_log.txt", "w") as logF, \
            open("syn_feats.pkl", "wb") as synFile, \
            open("syn_scores.pkl", "wb") as scoresFile:
        for i, doc in enumerate(docList):
            start_time = time.time()
            features.append(defaultdict(float))
            scores.append(defaultdict(list))
            for sentence in doc:
                parses = rrp.parse(sentence, rerank=False)
                # print(len(parses))
                # print(sentence, file=logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(len(parses[0].ptb_parse.tokens()))
                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file=logF)
                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except:
                    print("No parse available - skipping")
            features[i] = {x: v for x, v in features[i].items()}
            print("{0}".format(sorted(features[i].items(), key=operator.itemgetter(1), reverse=True)), file=logF)
            print("--- {0} seconds for {1} sentences ---".format(time.time() - start_time, len(doc)))
        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)
        # t_bllip = Timer(lambda: rrp.parse(sentence))
        # print("bllip", t_bllip.timeit(number=5))
    pass
def main():
    emit = defaultdict(int)
    transition = defaultdict(int)
    context = defaultdict(int)
    states = set()
    start = defaultdict(int)
    wordToTag = dict()
    output = open("hmmoutput.txt", 'w')
    totalStart = 0
    with open("hmmmodel.txt") as f:
        for line in f:
            if line.startswith("T"):
                temp = line.rstrip("\n").split(" ")
                transition[temp[1].strip() + " " + temp[2].strip()] = float(temp[4].strip())
                if temp[1].strip().startswith("<s>"):
                    start[str(temp[2].strip())] += int(temp[3].strip())
                    totalStart += int(temp[3].strip())
            elif line.startswith("E"):
                temp = line.rstrip("\n").split(" ")
                emit[temp[1].strip() + " " + temp[2].strip()] = float(temp[3].strip())
            elif line.startswith("C"):
                temp = line.rstrip("\n").split(" ")
                context[temp[1].strip()] = int(temp[2].strip())
                states.add(temp[1].strip())
            elif line.startswith("W"):
                temp = line.rstrip("\n").split(" ")
                word = temp[1].strip()
                if word not in wordToTag:
                    wordToTag[word] = set()
                tags = temp[2].strip().rstrip(",").strip()
                tags = tags.split(",")
                for tag in tags:
                    wordToTag[word].add(tag.strip())
    for i in states:
        start[i] = start[i] * totalStart
    with open(sys.argv[1]) as f:
        for line in f:
            obs = line.rstrip("\n").split(" ")
            listOfTags = viterbi(obs, states, start, transition, emit, context, wordToTag)
            for i in range(len(listOfTags) - 1, -1, -1):
                output.write(str(listOfTags[i]) + " ")
            output.write("\n")
def main():
    emit = defaultdict(int)
    transition = defaultdict(int)
    context = defaultdict(int)
    wordToTag = dict()
    states = set()
    result = open('hmmmodel.txt', 'w')
    with open(sys.argv[1]) as f:
        for line in f:
            previous = "<s>"
            context[previous] += 1
            wordtags = line.strip().split(" ")
            for wordtag in wordtags:
                word = wordtag[:len(wordtag) - 3]
                tag = wordtag[-2:]
                transition[previous + " " + tag] += 1
                context[tag] += 1
                emit[tag + " " + word] += 1
                previous = tag
                if word not in wordToTag:
                    wordToTag[word] = set()
                wordToTag[word].add(tag)
                states.add(tag)
            transition[previous + " </s>"] += 1
    numberOfStates = len(states)
    for key in transition:
        previous, tag = key.split(" ")
        result.write("T " + key + " " + str(transition[key]) + " "
                     + str((transition[key] + 1) / (context[previous] + numberOfStates)) + "\n")
    for key in emit:
        tag, word = key.split(" ")
        result.write("E " + key + " " + str(emit[key] / context[tag]) + "\n")
    for key in context:
        result.write("C " + key + " " + str(context[key]) + "\n")
    for key in wordToTag:
        s = "W " + key + " "
        for tag in wordToTag[key]:
            s += str(tag) + ","
        s += "\n"
        result.write(s)
def AprioriPass(lineList, freqk, Support, k):
    # GetPairs builds the candidate itemsets of size k from the frequent itemsets of the previous pass.
    items_pairs = GetPairs(freqk, k)
    freqk = defaultdict(int)      # frequencies of the new candidate itemsets
    item_temp = set()             # itemsets that pass the minimum-support check
    local_temp = defaultdict(int)
    for items in items_pairs:         # for every candidate itemset from GetPairs
        for line in lineList:         # for every basket (line) of the file
            if items.issubset(line):  # if the itemset occurs in the basket
                local_temp[items] += 1
    for item1, count in local_temp.items():  # item1 is the itemset (key), count its frequency (value)
        if count >= Support:
            freqk[item1] = count      # keep the itemset together with its support count
            item_temp.add(item1)
    # Return the frequent itemsets with their counts, plus the itemsets that seed the k+1 pass.
    return freqk, item_temp
def __init__(self, config, globalDict, parent=None):
    VoltageGlobalAdjustForm.__init__(self)
    VoltageGlobalAdjustBase.__init__(self, parent)
    self.config = config
    self.configname = 'VoltageGlobalAdjust.Settings'
    self.settings = self.config.get(self.configname, Settings())
    self.globalAdjustDict = SequenceDict()
    self.myLabelList = list()
    self.myBoxList = list()
    self.historyCategory = 'VoltageGlobalAdjust'
    self.adjustHistoryName = None
    self.globalDict = globalDict
    self.adjustCache = self.config.get(self.configname + ".cache", dict())
    self.savedValue = defaultdict(lambda: None)
    self.displayValueObservable = defaultdict(lambda: Observable())
def fillUsingLines(linespath):
    e2name2freq = defaultdict(dict)
    f = open(linespath)
    for line in f:
        parts = line.split('\t')
        for w in parts[4].split():
            if '/m/' in w:
                (mid, tokens, notabletype) = getentparts(w)
                name = ' '.join(tokens)
                if mid not in e2name2freq:
                    e2name2freq[mid] = defaultdict(lambda: 0)
                e2name2freq[mid][name] += 1
    f.close()
    return e2name2freq
def mapLocationToFeed(self):
    if self.feeds is not None and self.movements is not None:
        gridEventMap = defaultdict(list)
        for fid, locrlist in self.movements.items():
            for locr in locrlist:
                loc = Location(locr.latitude, locr.longitude)
                if loc in gridEventMap.keys():
                    gridEventMap[loc].append(locr)
                else:
                    elist = list()
                    elist.append(locr)
                    gridEventMap[loc] = elist
        self.movements.clear()
        self.feeds.clear()
        for gid, elist in gridEventMap.items():
            intv = list()
            for locr in elist:
                intv.append(locr.loc_timestamp)
            totintv = 0.0
            if len(intv) > 1:
                for i in range(0, len(intv) - 1):
                    totintv += abs((intv[i + 1] - intv[i]).total_seconds() / LocationRecord.TimeScale)
                totintv /= len(intv) - 1
            lid = "U" + str(gid.x_dimension) + "-" + str(gid.y_dimension)
            new_feed = FeedInfo(lid, gid, FeedInfo.default_tag, int(math.ceil(totintv)))
            self.feeds[lid] = new_feed
def word_count(words, existing_list=None):
    result = existing_list if existing_list else defaultdict(int)
    for word in words:
        if word == '\'s':
            continue
        result['{0}'.format(word)] += 1
    return result
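# A small usage sketch for word_count above: counts accumulate when the same mapping
# is passed back in (tokenisation here is just a plain str.split, an assumption).
counts = word_count("the cat sat on the mat".split())
counts = word_count("the dog".split(), counts)
assert counts["the"] == 3 and counts["cat"] == 1 and counts["dog"] == 1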
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()
    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1
    n_words_total = label_word_freq_dist.N()
    high_info_words = set()
    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)
        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
def solve(par):
    C, combine, D, opposite, N, S = par
    comb = {}
    for c in combine:
        x = list(c)[:2]
        comb[tuple(x)] = c[2]
        x.reverse()
        comb[tuple(x)] = c[2]
    oppo = defaultdict(list)
    for o in opposite:
        oppo[o[0]].append(o[1])
        oppo[o[1]].append(o[0])
    result = []
    for s in list(S):
        if len(result) > 0 and (result[-1], s) in comb:
            c = result[-1]
            result.pop()
            result.append(comb[(c, s)])
            continue
        flag = True
        if s in oppo:
            for x in oppo[s]:
                if x in result:
                    result = []
                    flag = False
                    break
        if flag:
            result.append(s)
    return '[' + ', '.join(result) + ']'
def formDictionary(file):
    """For creating a dictionary of word:phonemes from the CMU phoneme dictionary"""
    phoneme_dictionary = defaultdict()
    for line in open(file):
        if line[0].isalpha():
            # This is where we start the dictionary, skipping over all punctuation
            # and developer notes in the beginning.
            split = line.split()  # split is a list of the line items separated by whitespace
            for item in split:  # item is each individual segment of the line
                if item[len(item) - 1] == ")":
                    # If the word is a duplicate and has a (1) or similar sequence after it, cut it
                    # out. There are multiple pronunciations for this word.
                    item = item[0:len(item) - 3]
                    split[0] = item  # First item in line is now the edited word
                    # split[split.index(item)] = item[0:len(item)-3]
                if not item[len(item) - 1].isalpha():
                    # if the item is not entirely letters, cut the last character off.
                    split[split.index(item)] = item[0:len(item) - 1]
            if split[0] in phoneme_dictionary.keys():
                # if this word is already in the dictionary, make a list out of its existing list:
                # the existing list of duplicate phonemes first, then the current list of phonemes.
                list = [phoneme_dictionary[split[0]], split[1:]]
                phoneme_dictionary[split[0]] = list
            else:
                phoneme_dictionary[split[0]] = split[1:]  # dictionary of {word: [phoneme list]}
            # print split
    current_path = open("phoneme_output.txt", "w")
    for entry in phoneme_dictionary:
        current_path.write(entry + str(phoneme_dictionary.get(entry)) + "\n")
    current_path = open("phon_pickle.txt", "w")
    pickle.dump(phoneme_dictionary, current_path, pickle.HIGHEST_PROTOCOL)
    current_path.close()
    return phoneme_dictionary
def test_default_factory(self):
    import _collections
    f = lambda: 42
    d = _collections.defaultdict(f)
    assert d.default_factory is f
    d.default_factory = lambda: 43
    assert d['5'] == 43
def read_graph(file) -> dict:
    DATA = file.read().split()
    myDict = defaultdict(set)
    for e in DATA:
        E = e.split(';')
        myDict[E[0]].add(E[1])
    return myDict
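# Usage sketch for read_graph above, assuming whitespace-separated "src;dst" edge
# records; io.StringIO stands in for a real file handle.
import io

g = read_graph(io.StringIO("a;b a;c b;c"))
assert g == {'a': {'b', 'c'}, 'b': {'c'}}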
def counting_sort(A, key=lambda x: x):
    B, C = [], defaultdict(list)
    for x in A:
        C[key(x)].append(x)
    for k in range(min(C), max(C) + 1):
        B.extend(C[k])
    return B
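# Usage sketch for counting_sort above: keys must be integers, since the buckets are
# walked with range(min(C), max(C) + 1), and the sort is stable (ties keep input order).
pairs = [('b', 2), ('a', 1), ('c', 1)]
assert counting_sort(pairs, key=lambda x: x[1]) == [('a', 1), ('c', 1), ('b', 2)]
assert counting_sort([4, 1, 3, 1, 2]) == [1, 1, 2, 3, 4]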
def gviz_representation(pred_map):
    child_lookup = defaultdict(lambda: None)
    active_nodes = []
    for c in pred_map:
        if pred_map[c]:
            for p in pred_map[c]:
                if p not in child_lookup:
                    child_lookup[p] = [c]
                else:
                    child_lookup[p].append(c)
    rstr = "digraph {\n node [shape=\"circle\"];\n"
    for p in pred_map:
        c_list = child_lookup[p]
        if c_list:
            c_str = " ".join(str(c.id) for c in c_list)
            rstr += str(p.id) + "->{" + c_str + "}\n"
    rstr += "}\n"
    return rstr
def PriorFociWithSimilarFringe(self, *, current_focus, timestamp, threshold=0.2, decay_factor=0.97):
    """Gets prior items with overlapping fringe."""
    scores = defaultdict(float)
    for fe, wt in current_focus.stored_fringe.items():
        for other_focusable, other_wt in self.fringe_element_to_item_to_wt[fe].items():
            if other_focusable is not current_focus:
                scores[other_focusable] += other_wt * wt
    out = []
    for other_focusable in scores.keys():
        age = max(0, timestamp - self.last_focus_time[other_focusable])
        scores[other_focusable] *= (decay_factor ** age)
        if scores[other_focusable] >= threshold:
            out.append((other_focusable, scores[other_focusable]))
    return sorted(out, reverse=True, key=lambda x: x[1])
def get_costs(self):
    # method to create the costs dictionary
    maxCost = 0
    edge = None
    costs = defaultdict(lambda: 1000)  # default dict with the lambda as "infinity"
    costForTrain = {}
    for i in self.Edges:  # go through edges
        costs[(i[0], i[1])] = i[2]  # put in the cost
        costs[(i[1], i[0])] = i[2]
        if i[4] == 'B':
            print("i: ", i)
            print("i2: ", i[2])
            costForTrain[(i[0], i[1], i[2])] = i[2]  # put in the cost
    for i in self.Verticies:  # go through vertices
        costs[(i[0], i[0])] = 0  # cost from self to self
    totalSpending = 0
    '''
    while totalSpending <= 100000000 and costForTrain[max(costForTrain.keys())] > 2:
        totalSpending += costForTrain[max(costForTrain.keys())] * 1000000
        if totalSpending > 100000000:
            break
        self.profit -= costForTrain[max(costForTrain.keys())] * 1000000
        max(costForTrain.keys())[2] = 2
        costs[(max(costForTrain.keys())[0], max(costForTrain.keys())[1])] = 2
        costs[(max(costForTrain.keys())[1], max(costForTrain.keys())[0])] = 2
    '''
    while totalSpending <= 100000000 and costs[max(costs.keys())] > 2:
        totalSpending += costs[max(costs.keys())] * 1000000
        if totalSpending > 100000000:
            break
        self.profit -= costs[max(costs.keys())] * 1000000
        edge = None
        for i in self.Edges:
            if max(costs.keys())[0] == i[0] and max(costs.keys())[1] == i[1]:
                edge = i
                break
        edge[2] = 2
        costs[(max(costs.keys())[0], max(costs.keys())[1])] = 2
        costs[(max(costs.keys())[1], max(costs.keys())[0])] = 2
    return costs  # return the dictionary
def checkIfPrerequisite(self, n: int, prerequisites, queries):
    self.graph = defaultdict(list)
    for pre in prerequisites:
        self.graph[pre[1]].append(pre[0])
    print(self.graph)
    self.yz = [[] for i in range(n)]

    def bfs(s):
        visited = [False] * n
        queue = [s]
        visited[s] = True
        while queue:
            cur = queue.pop(0)
            self.yz[s].append(cur)
            for i in self.graph[cur]:
                if visited[i] == False:
                    queue.append(i)
                    visited[i] = True

    for i in range(n):
        bfs(i)
    print(self.yz)
    # return self.yz  # [[0, 1], [1]]
    print(self.yz)
    ans = []
    for query in queries:
        if len(self.yz[query[1]]) == 1:
            ans.append(False)
        else:
            if query[0] in self.yz[query[1]]:
                ans.append(True)
            else:
                ans.append(False)
    return ans
def refresh_schema(pk):
    try:
        database = Database.objects.get(pk=pk)
        schema_query = Schema_Query.get(database.db_type)
        schema_data = []
        if database.db_type != 'sqlserver':
            flag, schema_data = run_sql(database, schema_query)
            if not flag:
                raise build_exception_from_java(schema_data)
        else:
            schema_data = sqlserver_schema_data(database)
        schema_dic = defaultdict(OrderedDict)
        type_map = Type_TO_CN.get(database.db_type)
        for x in schema_data:
            owner = x.get('OWNER')
            object_type = type_map.get(x.get('OBJECT_TYPE'))
            object_name = x.get('OBJECT_NAME')
            if not schema_dic.get(owner) or not schema_dic.get(owner).get(object_type):
                schema_dic[owner][object_type] = []
            schema_dic[owner][object_type].append(object_name)
        detail = OrderedDict(sorted(schema_dic.items()))
        created_at = datetime.now().replace(microsecond=0)
        schema = DB_SCHEMA.objects.update_or_create(
            database=database,
            defaults={'detail': detail, 'created_at': created_at})
        Key_Template = f'''{pk}:schema:*'''
        for key in redis.scan_iter(Key_Template):
            redis.delete(key)
        get_table_rows(database)
        return detail
    except ObjectDoesNotExist:
        return {'error_message': ''}
    except Exception as err:
        return {'error_message': str(err)}
def get_sim_item(df_, user_col, item_col, use_iif=False):
    df = df_.copy()
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))
    # bring in the time factor
    user_time_ = df.groupby(user_col)['time'].agg(list).reset_index()
    user_time_dict = dict(zip(user_time_[user_col], user_time_['time']))
    sim_item = {}
    item_cnt = defaultdict(int)  # number of clicks per item
    for user, items in tqdm(user_item_dict.items()):
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            sim_item.setdefault(item, {})
            for loc2, relate_item in enumerate(items):
                if item == relate_item:
                    continue
                t1 = user_time_dict[user][loc1]  # click timestamps
                t2 = user_time_dict[user][loc2]
                sim_item[item].setdefault(relate_item, 0)
                if not use_iif:
                    if loc1 - loc2 > 0:
                        # backward direction
                        sim_item[item][relate_item] += 1 * 0.7 * (0.8 ** (loc1 - loc2 - 1)) * (
                            1 - (t1 - t2) * 10000) / math.log(1 + len(items))
                    else:
                        # forward direction
                        sim_item[item][relate_item] += 1 * 1.0 * (0.8 ** (loc2 - loc1 - 1)) * (
                            1 - (t2 - t1) * 10000) / math.log(1 + len(items))
                else:
                    sim_item[item][relate_item] += 1 / math.log(1 + len(items))
    # normalise by the click counts of both items A and B
    sim_item_corr = sim_item.copy()
    for i, related_items in tqdm(sim_item.items()):
        for j, cij in related_items.items():
            sim_item_corr[i][j] = cij / ((item_cnt[i] * item_cnt[j]) ** 0.2)
    return sim_item_corr, user_item_dict
def parse_mscx(fh, voices, out, nverses=1):
    doc = pulldom.parse(fh)
    level = 0
    tracks = {}
    staff_tracks = defaultdict(set)
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            if level == 2 and node.tagName == "Part":
                track_name, staff_ids = parse_part(doc)
                if track_name and staff_ids:
                    tracks[track_name] = staff_ids
                    for staff_id in staff_ids:
                        staff_tracks[staff_id].add(track_name)
            elif level == 2 and node.tagName == "Staff" and staff_tracks[node.getAttribute("id")] & voices:
                log.info("Parsing staff with ID [%s]" % node.getAttribute("id"))
                parse_staff(doc, out, nverses=nverses)
            else:
                if log.getEffectiveLevel() <= logging.DEBUG and level < 3:
                    log.debug("%sGot <%s>." % (" " * level, node.tagName))
                level += 1
        elif event == pulldom.END_ELEMENT:
            level -= 1
            if log.getEffectiveLevel() <= logging.DEBUG and level < 3:
                log.debug("%sGot </%s>." % (" " * level, node.tagName))
    log.info("Found tracks [%s]" % (tracks,))
def fetch_and_create_graph(self):
    '''
    Get data using requests
    For each word in the response, create and add them to `buckets`
    WELSH goes into W_LSH, WE_SH, WEL_H and WELS_ buckets
    Second step, spin a graph amongst the buckets
    :return:
    '''
    buckets = defaultdict(set)
    # Test data
    if self.test:
        line_iter = t_data
    # fetch data
    else:
        response = requests.get(self.uri)
        line_iter = response.iter_lines()
        next(line_iter)
    # parse words
    for line in line_iter:
        for word in line.split():
            word = word.decode("utf-8")
            self.all_words.add(word)
            for i in range(len(word)):
                # create or get buckets of neighbors
                bucket = word[:i] + '_' + word[i + 1:]
                buckets[bucket].add(word)
    print(f'found {len(self.all_words)} words')
    # second step
    for bucket in buckets:
        for word1 in buckets[bucket]:
            for word2 in buckets[bucket]:
                if word1 != word2:
                    self.graph[word1].add(word2)
                    self.graph[word2].add(word1)
def __init__(self, top_url, max_level=5, max_links=50):
    self.top_url = top_url
    self.max_level = max_level
    self.max_links = max_links
    self.observed_links = {}  # page url -> links
    self.visited_links = {}  # page number -> link
    self.product_links = set()
    self.printed_skipped_urls = set()
    self.num_visited_pages = 0
    self.num_walks = 0
    # links that redirect to other domains etc.
    self.blacklisted_links = set()
    self.link_visit_counts = defaultdict(int)  # page number -> link
    self.top_url_tld = get_tld_or_host(top_url)  # TLD for the first URL
    self.base_filename = safe_filename_from_url(
        top_url.replace("http://", "").replace("https://", ""))
    self.outdir = join(OUTDIR, self.base_filename)
    self.png_file_name = join(self.outdir, 'PAGE_NO_URL.png')
    self.page_src_file_name = join(self.outdir, 'PAGE_NO_URL.html')
    self.links_json_file_name = join(self.outdir, 'links_%s.json' % self.base_filename)
    self.visited_links_json_file_name = join(
        self.outdir, 'visited_links_%s.json' % self.base_filename)
    self.product_links_file_name = join(
        self.outdir, 'product_links_%s.txt' % self.base_filename)
    from selenium.webdriver.chrome.options import Options
    from selenium import webdriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    from selenium.webdriver.firefox.options import Options
    self.driver = webdriver.Chrome(options=chrome_options)
    self.driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    self.external_link_err_cnt = 0
    self.timeout_err_cnt = 0
    self.make_site_dir()
def dynamic_vm_values(trace, code_start=BADADDR, code_end=BADADDR, silent=False):
    """
    Find the virtual machine context necessary for an automated static analysis.
    code_start = the bytecode start -> often the param for vm_func and usually starts right after vm_func
    code_end = the bytecode end -> bytecode usually a big chunk, so if we identify several x86/x64 inst in a row we reached the end
    base_addr = startaddr of the jmp table -> most often used offset in the vm_trace
    vm_addr = startaddr of the vm function -> biggest function in .vmp segment,
    :param trace: instruction trace
    :return: vm_ctx -> [code_start, code_end, base_addr, vm_func_addr, vm_funcs]
    """
    base_addr = defaultdict(lambda: 0)
    vm_addr = find_vm_addr(deepcopy(trace))
    trace, vm_seg_start, vm_seg_end = extract_vm_segment(trace)
    code_addrs = []
    # try finding code_start
    if code_start == BADADDR:
        code_start = GetFunctionAttr(vm_addr, FUNCATTR_END)  # NextHead(GetFunctionAttr(vm_addr, FUNCATTR_END), vm_seg_end)
        code_start = NextHead(code_start, BADADDR)
        while isCode(code_start):
            code_start = NextHead(code_start, BADADDR)
    for line in trace:
        # construct base addr dict of offsets -> jmp table should be the one most used
        if len(line.disasm) == 2:
            try:
                offset = re.findall(r'.*:off_([0123456789abcdefABCDEF]*)\[.*\]', line.disasm[1])[0]
                base_addr[offset] += 1
            except:
                pass
        # code_start additional search of vm_func params
        if line.addr == vm_addr:
            for l in trace[:trace.index(line)]:
                if l.disasm[0] == 'push':
                    try:
                        arg = re.findall(r'.*_([0123456789ABCDEFabcdef]*)', l.disasm[1])
                        if len(arg) == 1:
                            code_addrs.append(int(arg[0], 16))
                    except Exception, e:
                        print e.message
def path_finder_bfs(start: str, end: str, graph: Dict):
    # Setup
    visited = defaultdict(int)
    visited[start] = True
    deck = deque()
    for word in graph[start]:
        deck.append((word, [start]))
    # Solve for path
    while deck:
        word, result = deck.popleft()
        # Found path
        if word == end:
            return result + [end]
        # Still searching
        for next_word in graph[word]:
            if not visited[next_word]:
                deck.append((next_word, result + [word]))
        visited[word] = True
    # No path found
    return None
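# Usage sketch for path_finder_bfs above on a tiny word-ladder style adjacency dict
# (the graph shape is an assumption; any str -> list-of-str mapping works).
ladder = {
    'hit': ['hot'],
    'hot': ['hit', 'dot'],
    'dot': ['hot', 'dog'],
    'dog': ['dot'],
}
assert path_finder_bfs('hit', 'dog', ladder) == ['hit', 'hot', 'dot', 'dog']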
def read_test_files(self, annotation_file):
    '''
    Read files for testing
    '''
    features_test = []
    labels_test = []
    annotation_file = self.basepath + '/annotations/general/' + annotation_file
    annotations = self._read_annotations(annotation_file)
    annotation_dict = defaultdict(list)
    for anno in annotations:
        annotation_dict[anno[3]].append(anno)
    for filename, annos in annotation_dict.items():
        path = self.basepath + '/audio/' + filename
        features, labels = self._read_test_windows(path, annos)
        features_test.extend(features)
        labels_test.extend(labels)
    return features_test, labels_test
def _handle_if_timeouts(self):
    """
    Periodically checks each interface state and issues an if revocation,
    if no keep-alive message was received for IFID_TOUT.
    """
    if_id_last_revoked = defaultdict(int)
    while self.run_flag.is_set():
        start_time = time.time()
        with self.ifid_state_lock:
            for (if_id, if_state) in self.ifid_state.items():
                cur_epoch = ConnectedHashTree.get_current_epoch()
                # Check if interface has timed-out.
                if ((if_state.is_expired() or if_state.is_revoked())
                        and (if_id_last_revoked[if_id] != cur_epoch)):
                    if_id_last_revoked[if_id] = cur_epoch
                    if not if_state.is_revoked():
                        logging.info("IF %d appears to be down.", if_id)
                    self._issue_revocation(if_id)
                    if_state.revoke_if_expired()
        sleep_interval(start_time, self.IF_TIMEOUT_INTERVAL, "Handle IF timeouts")
def _schedule_all_holidays(self, holiday_ids_to_ignore=()):
    resolver = DataResolver(None)
    season_service = services.season_service()
    current_season_length = season_service.season_length_option
    drama_scheduler = services.drama_scheduler_service()
    season_data = defaultdict(list)
    for (season_type, season_content) in season_service.get_seasons_for_scheduling():
        season_data[season_type].append(season_content)
    for (season, day, holiday_id) in self._holiday_times[current_season_length].holidays_to_schedule_gen():
        if holiday_id in holiday_ids_to_ignore:
            continue
        for season_content in season_data[season]:
            holiday_start_time = season_content.start_time + create_time_span(days=day)
            drama_scheduler.schedule_node(
                HolidayService.CUSTOM_HOLIDAY_DRAMA_NODE,
                resolver,
                specific_time=holiday_start_time,
                holiday_id=holiday_id)
def predictLocation(self, tokens_in_tweet):
    gc_probabilities_for_tweet = defaultdict(float)
    token_found = False
    for token in tokens_in_tweet:
        gc_probabilities_dict_for_token = self.gc_probabilities_dict_for_tokens.get(token)
        if gc_probabilities_dict_for_token == None:
            continue
        else:
            token_found = True
            for gcid in gc_probabilities_dict_for_token.keys():
                gc_probability_for_token = gc_probabilities_dict_for_token[gcid]
                gc_probabilities_for_tweet[gcid] += gc_probability_for_token * self.inf_gain_ratios[token]
    if token_found:
        return max(gc_probabilities_for_tweet.iteritems(), key=operator.itemgetter(1))[0]
    else:
        return self.gcid_with_max_prior
def recommend(user_count, user_dict, K, topN):
    # rank = defaultdict(int)
    W = measureSimilarity(user_dict)
    f = open("result.txt", "w")
    user_id = 1
    while user_id <= user_count:
        # important: rank must be reset for each user; it is easy to place this line in the wrong spot
        rank = defaultdict(int)
        for i, score in user_dict[user_id]:
            for j, wj in sorted(W[i].items(), key=itemgetter(1), reverse=True)[0:K]:
                if j in user_dict[user_id]:
                    continue
                rank[j] += score * wj
        l = sorted(rank.items(), key=itemgetter(1), reverse=True)[0:topN]
        print('user_id ' + str(user_id) + ' : ')
        print(l)
        for item in l:
            f.write(str(user_id) + ' | ' + str(item[0]))
            f.write("\n")
        user_id += 1
def __init__(self, crawlParams):
    # self.visited = []
    self.visited = {}
    self.pagesCount = 0
    self.priorityQueue = crawlParams['priorityQueue']
    self.scorer = crawlParams['scorer']
    self.pageScoreThreshold = crawlParams['pageScoreThreshold']
    self.urlScoreThreshold = crawlParams['urlScoreThreshold']
    self.pagesLimit = crawlParams['num_pages']
    # self.mode = crawlParams['mode']
    self.restricted = crawlParams['restricted']
    self.combineScore = crawlParams['combineScore']
    self.pagesDir = crawlParams['pagesDir']
    # self.hosts_RelNonRelLists = {}
    self.bufferLen = crawlParams['bufferLen']
    # list contains number of relevant at index 0 and number of non-relevant at index 1
    self.sourcesImp = defaultdict(lambda: [1., 1.])
    self.siScoreCombineMethod = crawlParams['siScoreCombineMethod']
    self.topicWeight = 0.6
    self.siWeight = 0.4
def AP_interpolated_buckets(sortedRanks, cutoff, total_ranks):
    values = list()
    index_count = defaultdict(lambda: 0)
    for rank in sortedRanks:
        for i in range(1, bucket_count + 1):
            if rank <= i:
                index_count[i] += 1
    for i in range(1, bucket_count + 1):
        counter = index_count[i]
        value = counter / (i * (total_ranks / bucket_count))
        values.append(value)
    values.reverse()
    sum_ = 0
    max_ = -1
    for value in values:
        if value > max_:
            sum_ += value
            max_ = value
        else:
            sum_ += max_
    return (sum_ / len(values), len(values))
def output(self):
    output_level = defaultdict(str)
    total_pos = 0

    def draw_node(i, level):
        nonlocal total_pos
        if i >= len(self.datas):
            return 0
        ln = draw_node(2 * i + 1, level + 1)
        ol_str = output_level[level]
        for k in range(total_pos - len(ol_str)):
            ol_str += ' '
        ol_str += str(self.datas[i])
        output_level[level] = ol_str
        total_pos += 2
        rn = draw_node(2 * i + 2, level + 1)
        return ln + rn + 1

    draw_node(0, 0)
    for k, v in sorted(output_level.items(), key=lambda e: e[0]):
        print(v)
def attribute_value_proportion(instances, attribute, attribute_names):
    '''Returns a defaultdict containing the counts of occurrences and proportion of
    each value of attribute in the list of instances. attribute_names is the list
    we created above, where each element is the name of an attribute.'''
    attribute_value_counts = defaultdict(int)
    instance_value_counts = {}
    instance_proportions = {}
    # find position of attribute in attribute_names
    position_index = attribute_names.index(attribute)
    # count occurrences of values in that position in the index list
    for instance in instances:
        # save the value of the attribute
        instance_value = instance[position_index].strip()
        if instance_value not in instance_value_counts:
            # add to dictionary, but strip beforehand
            instance_value_counts[instance_value] = 0
            instance_proportions[instance_value] = 0
        # increment at dictionary key, make sure you strip beforehand
        instance_value_counts[instance_value] += 1
        instance_proportions[instance_value] = float(instance_value_counts[instance_value]) / len(instances)
    return instance_proportions
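# Usage sketch for attribute_value_proportion above with a toy instance list
# (the column layout is an assumption for illustration only).
names = ['outlook', 'temp', 'play']
rows = [['sunny', 'hot', 'yes'], ['rainy', 'mild', 'no'], ['sunny', 'cool', 'yes']]
props = attribute_value_proportion(rows, 'outlook', names)
assert abs(props['sunny'] - 2 / 3) < 1e-9 and abs(props['rainy'] - 1 / 3) < 1e-9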
def oppositeSums(arr):
    origin = arr
    n = len(arr)
    result = len(arr)
    # seen = defaultdict(lambda: 0)
    # seen_rev = defaultdict(lambda: 0)
    diff = defaultdict(lambda: 0)
    for num in arr:
        rev = int(str(num)[::-1])
        sub = num - rev
        # if num in seen:
        #     result += seen[num]
        # elif num in seen_rev:
        #     result += seen_rev[num]
        # seen[num] += 1
        # seen_rev[rev] += 1
        if sub in diff:
            result += diff[sub]
        diff[sub] += 1
    return result
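# Usage sketch for oppositeSums above: it appears to count index pairs i <= j with
# a[i] + reverse(a[j]) == a[j] + reverse(a[i]), i.e. pairs sharing the same value of
# num - reverse(num); the i == j pairs account for the initial len(arr).
assert oppositeSums([1, 20, 2, 11]) == 7  # four i == j pairs plus (1, 2), (1, 11), (2, 11)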
def df_to_edges(graph: pd.DataFrame):
    graph_dict = graph.T.to_dict()
    edges = defaultdict(list)
    for src_id, row_dict in graph_dict.items():
        src_x = row_dict['x']
        src_y = row_dict['y']
        for dst_id in row_dict['adj']:
            dst_x = graph_dict[dst_id]['x']
            dst_y = graph_dict[dst_id]['y']
            weight = graph_dict[dst_id]['weight']
            if not np.isnan(weight):
                cost = weight * gaversin_distance(src_x, src_y, dst_x, dst_y)
            else:
                cost = gaversin_distance(src_x, src_y, dst_x, dst_y)
            edges[src_id].append((cost, dst_id))
    return edges
def solution(N, road, K):
    answer = 1
    cost = defaultdict(lambda: [123456789 for _ in range(N + 1)])
    connection_list = [[] for _ in range(N + 1)]
    table = [123456789 for _ in range(N + 1)]
    need_to_check = deque()
    answer_set = set()
    for r in road:
        if cost[r[0]][r[1]] == 123456789:
            connection_list[r[0]].append(r[1])
        if cost[r[1]][r[0]] == 123456789:
            connection_list[r[1]].append(r[0])
        cost[r[0]][r[1]] = min(r[2], cost[r[0]][r[1]])
        cost[r[1]][r[0]] = min(r[2], cost[r[1]][r[0]])
    for i in connection_list[1]:
        need_to_check.append(i)
        table[i] = cost[1][i]
        if table[i] <= K:
            if i not in answer_set:
                answer += 1
                answer_set.add(i)
    answer_set.add(1)
    while len(need_to_check) != 0:
        check = need_to_check.popleft()
        for i in connection_list[check]:
            if table[i] > table[check] + cost[check][i]:
                # update
                table[i] = table[check] + cost[check][i]
                need_to_check.append(i)
                if table[i] <= K:
                    if i not in answer_set:
                        answer += 1
                        answer_set.add(i)
    return answer
def get_jar(self, idc=-1):
    """
    Get the content of all files present in the JAR file stored in the
    field 9.184. The returned dictionary is structured as follows::

        {
            'file name': 'file content',
            ...
        }

    The content of the files is not parsed, but returned as a string value.

    :param idc: IDC value.
    :type idc: int

    :return: Content of all files stored in the JAR file.
    :rtype: dict
    """
    idc = self.checkIDC(9, idc)
    data = self.get_field("9.184", idc)
    if data != None:
        data = base64.decodestring(data)
        buffer = StringIO()
        buffer.write(data)
        ret = defaultdict()
        with zipfile.ZipFile(buffer, "r") as zip:
            for f in zip.namelist():
                name, _ = os.path.splitext(f)
                with zip.open(f, "r") as fp:
                    ret[name] = fp.read()
        return dict(ret)
    else:
        return None
def _compute(self):
    tvs = [x.getResult() for x in self._children]
    tvs = tvs[0:-1]
    from numpy import array
    tvStarts = [array(x.startsAsNumpyArray(), dtype='int64') for x in tvs]
    tvEnds = [array(x.endsAsNumpyArray(), dtype='int64') for x in tvs]
    numTracks = len(tvStarts)
    assert numTracks < 34, 'Maximum supported nr. of tracks for this statistic is 33'
    localBinSize = self._localBinSize
    binSize = self._binSizeStat.getResult()
    bins = np.arange(0, binSize, localBinSize)
    s = []
    for track in tvStarts:
        s.append(len(track))
    E = np.sum(s) / float(len(bins))
    O = np.zeros((len(bins), 1))
    binPositions = [np.floor_divide(t_starts, localBinSize) for t_starts in tvStarts]
    for track in binPositions:
        for binPos in track:
            O[binPos, 0] += 1
    # NOTE: this early return leaves the chi-square computation below unreachable.
    return O, E
    if not E > 0:
        T = 0
    else:
        T = np.sum(np.power((O - E), 2) / E)
    # print "--------------" + self.__class__.__name__ + "-----------------------"
    # print self._region, T, E, O
    r = defaultdict(int)
    r[0] = T
    return [T]
def validate(self, classifier):
    '''Runs the cross-validation with the classifier 'classifier'.

    Params:
        classifier: object implementing the following methods (see above)
            estimate(train_samples, train_labels)
            classify(test_samples) --> test_labels

    Returns:
        crossval_overall_result: recognition result of the whole cross-validation
            (over all folds)
        crossval_class_results: list of tuples (category, result) containing the
            per-class recognition results of the cross-validation.
    '''
    crossval_overall_list = []
    crossval_class_dict = defaultdict(list)
    for fold_index in range(self.n_folds):
        train_samples, train_labels, test_samples, test_labels = self.samples_fold(fold_index)
        classifier.estimate(train_samples, train_labels)
        estimated_test_labels = classifier.classify(test_samples)
        classifier_eval = ClassificationEvaluator(estimated_test_labels, test_labels)
        crossval_overall_list.append(list(classifier_eval.error_rate()))
        crossval_class_list = classifier_eval.category_error_rates()
        for category, err, n_wrong, n_samples in crossval_class_list:
            crossval_class_dict[category].append([err, n_wrong, n_samples])
    crossval_overall_mat = np.array(crossval_overall_list)
    crossval_overall_result = CrossValidation.crossval_results(crossval_overall_mat)
    crossval_class_results = []
    for category in sorted(crossval_class_dict.keys()):
        crossval_class_mat = np.array(crossval_class_dict[category])
        crossval_class_result = CrossValidation.crossval_results(crossval_class_mat)
        crossval_class_results.append((category, crossval_class_result))
    return crossval_overall_result, crossval_class_results
def createFasta(input_file, append_file, order):
    fastaInDict = dict()
    orderDictSc = OrderedDict()
    with open(input_file) as FASTAIN, open(append_file, "a") as APP, open(order) as ORD:
        fastaParse = SeqIO.parse(FASTAIN, "fasta")
        fastaOutDict = defaultdict(list)
        for fastaSeq in fastaParse:
            s = str(fastaSeq.seq)
            idFasta = fastaSeq.id
            fastaInDict[idFasta] = s
        keyMap = {
            "Smic.scaffold9__1420062__1920061": "Smic.scaffold9__1420062__2138115",
            "Smic.scaffold236__1__500000": "Smic.scaffold236__1__795886",
            "Smic.scaffold338__1__500000": "Smic.scaffold338__1__646490",
            "Smic.scaffold458__1__500000": "Smic.scaffold458__1__544999"
        }
        for line in ORD:
            line = line.rstrip("\n")
            val = line.split("\t")
            k = val[0]
            if k in keyMap:
                k = keyMap[k]
                seq = fastaInDict[k][0:500000]
            else:
                seq = fastaInDict[k]
            fastaOutDict[val[1]].append(seq)
            orderDictSc[val[1]] = len(seq)
        for keys in orderDictSc.keys():
            chrom = "cluster" + keys
            print("Writing Chromosome " + str(chrom))
            APP.write(">" + chrom + "\n")
            APP.write("".join(fastaOutDict[keys]))
            APP.write("\n")
def assignScaffold(input_file, output_file, pos):
    scDict = defaultdict(list)
    s = ScaffoldList()
    count = 0
    with open(input_file, "r") as INP, open(output_file, "w") as OUT, open(pos, "r") as PAR:
        for line in INP:
            line = line.rstrip("\n")
            v = line.split("\t")
            a = v[3].split(":")
            b = a[1].split("-")
            s.add(v[0], a[0], b[0], b[1], count, scDict)
            count += 1
        # for line in PAR:
        #     line = line.rstrip("\n")
        #     v = line.split("\t")
        #     b = v[1].split(" ")
        #     pos1 = b[1]
        #     pos2 = b[3]
        #     side1 = b[0] + "-" + pos1
        #     side2 = b[2] + "-" + pos2
        #     side1Sc = s.search(side1, scDict[b[0]])
        #     side2Sc = s.search(side2, scDict[b[2]])
        #     OUT.write(line + "\t" + side1Sc.scaffold + "\t" + side2Sc.scaffold + "\n")
        for line in PAR:
            line = line.rstrip("\n")
            v = line.split(" ")
            pos1 = v[4]
            pos2 = v[5]
            side1 = v[2] + "-" + pos1
            side2 = v[2] + "-" + pos2
            print("read", v[0])
            print("side1", side1)
            print("side2", side2)
            side1Sc = s.search(side1, scDict[v[2]])
            side2Sc = s.search(side2, scDict[v[2]])
            OUT.write(line + "\t" + side1Sc.scaffold + "\t" + side2Sc.scaffold + "\n")
def _write_worklist(self, dest_plate_id, worklist):
    '''Write worklist.'''
    worklist_id = dest_plate_id + '_worklist'
    outfile = os.path.join(self.__outdir, worklist_id + '.csv')
    writer = csv.writer(open(outfile, 'a+'))
    worklist_map = defaultdict(list)
    for entry in sorted(worklist, key=lambda x: x[3]):
        worklist_map[entry[1]].append(entry)
    for idx in cycle(range(0, self.__rows * self.__cols)):
        if worklist_map[idx]:
            entry = worklist_map[idx].pop(0)
            writer.writerow([plate_utils.get_well(val) if idx == 1 or idx == 3 else str(val)
                             for idx, val in enumerate(entry)])
        if not sum([len(lst) for lst in worklist_map.values()]):
            break
def minNumberOfSemesters(self, n, dependencies, k):
    """
    :type n: int
    :type dependencies: List[List[int]]
    :type k: int
    :rtype: int
    """
    indegree = [0] * (n + 1)
    graph = defaultdict(list)
    for edge in dependencies:
        graph[edge[0]].append(edge[1])
        indegree[edge[1]] += 1
    print(indegree)
    print(graph)
    q = []
    for i in range(1, n + 1):
        if indegree[i] == 0:
            q.append(i)
    count = 0
    ans = 0
    while q:
        u = q.pop()