def read_tmp_out(tmp_out=None, file_str=None, sample=None):
    gj.printFuncRun('read_tmp_out')
    gj.printFuncArgs()
    fa_dict = read_fa()
    tx_base_pos_dict = nested_dict(2, list)  # {tx:{'A':[pos1,pos2],'T':[]}}
    base_enrich_dict = nested_dict(1, int)
    with open(tmp_out, 'r') as TMP_OUT:
        for line in TMP_OUT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            transcript_id = arr[0]
            transcript_len = int(arr[1])
            if transcript_len != len(fa_dict[transcript_id]):
                print "transcript length not consistent with reference: %s, tmp_out len: %s, reference len: %s" % (transcript_id, transcript_len, len(fa_dict[transcript_id]))
                sys.exit()
            for n, base_enrichment_score in enumerate(arr[4:]):
                score = base_enrichment_score.split(',')[0]
                # if score != "NULL" and float(score) != 0 and float(score) >= 0.3:
                if score != "NULL" and float(score) != 0:
                    base = fa_dict[transcript_id][n]
                    tx_base_pos_dict[transcript_id][base].append(n)
                    base_enrich_dict[base.upper()] += 1
    print base_enrich_dict
    # val_ls = [base_enrich_dict[i] for i in ['A','T','C','G']]
    # gj.plot_ls_pie(labels=['A','T','C','G'], val=val_ls, dic="", title_str="", file_str=file_str)
    TXT = open(file_str, 'w')
    for i, j in base_enrich_dict.items():
        print >>TXT, i + '\t' + str(j)
    TXT.close()
    gj.printFuncRun('read_tmp_out')
def degree_hist(dg_txt=None):
    if dg_txt is None:
        dg_txt = '/Share/home/zhangqf7/gongjing/zebrafish/data/paris/shi-zp-5-rep-combine/27-DG.txt'
    dg_dict = read_dg_txt(dg_txt)
    trans_dict = loadTransGtfBed2()
    RRI_dict = nested_dict(2, list)
    for i, j in dg_dict.items():
        if j['lchr'] != j['rchr']:
            RRI_dict[j['RRI_type']][j['lchr']].append(j['rchr'])
            RRI_dict[j['RRI_type']][j['rchr']].append(j['lchr'])
        else:
            RRI_dict[j['RRI_type']][j['lchr']].append(j['rchr'])
    for i in ['inter', 'intra']:
        savefn = dg_txt.replace('.txt', '.%s.degree.txt' % (i))
        degree_ls_ls = [[], [], []]
        with open(savefn, 'w') as SAVEFN:
            for k, v in RRI_dict[i].items():
                print >>SAVEFN, '\t'.join(map(str, [k, trans_dict[k]['type'], len(v), len(set(v)), ','.join(list(set(v)))]))
                degree_ls_ls[0].append(len(set(v)))
                if trans_dict[k]['type'] == 'mRNA':
                    degree_ls_ls[1].append(len(set(v)))
                if trans_dict[k]['type'] == 'lncRNA':
                    degree_ls_ls[2].append(len(set(v)))
        degree_mean_ls = [np.mean(i) for i in degree_ls_ls]
        gj.cumulate_dist_plot(ls_ls=degree_ls_ls,
                              ls_ls_label=['%s, mean=%.2f' % (i, j) for i, j in zip(['all', 'mRNA', 'lncRNA'], degree_mean_ls)],
                              bins=40, title='degree distribution', ax=None,
                              savefn=savefn.replace('.txt', '.pdf'),
                              xlabel='log2(# of interacting partners)', ylabel=None,
                              add_vline=None, add_hline=None, log2transform=1,
                              xlim=None, ylim=None)
def read_dg_txt(dg_txt=None, support=3, filter_rRNA=True, only_mRNA_lncRNA=True):
    if dg_txt is None:
        dg_txt = '/Share/home/zhangqf7/gongjing/zebrafish/data/paris/shi-zp-4/7-DG.txt'
    dg_dict = nested_dict()
    with open(dg_txt, 'r') as DG:
        for line in DG:
            line = line.strip()
            if line.startswith('#'):
                header_ls = line.replace('#', '').split('\t')
                continue
            if not line:
                continue
            arr = line.split('\t')
            if int(arr[9]) < support:
                continue
            if (filter_rRNA and arr[13] == 'rRNA') or (filter_rRNA and arr[14] == 'rRNA'):
                continue
            if only_mRNA_lncRNA and arr[13] not in ['mRNA', 'lncRNA']:
                continue
            if only_mRNA_lncRNA and arr[14] not in ['mRNA', 'lncRNA']:
                continue
            for i, j in zip(header_ls, arr):
                dg_dict[arr[0]][i] = j
            if arr[1] == arr[5]:
                dg_dict[arr[0]]['RRI_type'] = 'intra'
            else:
                dg_dict[arr[0]]['RRI_type'] = 'inter'
    print "DG num: %s, file: %s" % (len(dg_dict), dg_txt)
    return dg_dict.to_dict()
def get_merchant_db_data(store, numdays):
    start, end = get_sliding_date_range(numdays)
    merchant_report = MerchantDbReport(store, ["marketingSource", "keyword"],
                                       ["revenue", "order_count"], ["add", "add"],
                                       None, time_aggregation=None)
    merchant_data = merchant_report.get_data(mode=None, length=None, start_date=start, end_date=end)
    entries = merchant_data["entries"]
    # vurve_sales = filter(lambda x: x["dimensions"]["marketingSource"]["value"] == "Vurve", entries)
    # nonvurve_sales = filter(lambda x: x["dimensions"]["marketingSource"]["value"] != "Vurve", entries)
    keyword_rev_data = nested_dict()
    for e in entries:
        for kw in e['dimensions']['keyword']['value']:
            keyword_rev_data[kw.lower()]['marketingSource'] = e["dimensions"]["marketingSource"]["value"]
            keyword_rev_data[kw.lower()]['revenue'] = e["metrics"]["revenue"]["value"]
            keyword_rev_data[kw.lower()]['order_count'] = e["metrics"]["order_count"]["value"]
    """
    for entry in vurve_sales:
        for kw in entry["dimensions"]["keyword"]["value"]:
            keyword_rev_data[kw.lower()]["vurve"] = entry["metrics"]["revenue"]["value"]
    for entry in nonvurve_sales:
        for kw in entry["dimensions"]["keyword"]["value"]:
            keyword_rev_data[kw.lower()]["nonvurve"] = entry["metrics"]["revenue"]["value"]
    """
    return keyword_rev_data
def readDGFrameFile(filename, interRRI_norRNA=1, support_read=3):
    fn_stat_dict = nested_dict()
    inter, intra = 0, 0
    with open(filename, 'r') as TXT:
        for line in TXT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            if arr[1] == arr[5]:
                intra += 1
            else:
                inter += 1
    fn_stat_dict['inter'] = inter
    fn_stat_dict['intra'] = intra
    fn_stat_dict['all'] = intra + inter
    df = pd.read_csv(filename, header=0, sep='\t')
    df['type'] = ['intra' if i == j else 'inter' for i, j in zip(df['lchr'], df['rchr'])]
    df_inter_RRI = df[df['type'] == 'inter']
    nx_inter_RRI = nx.from_pandas_dataframe(df_inter_RRI, 'lchr', 'rchr')
    fn_stat_dict['uniq RRI'] = len(nx_inter_RRI.edges())
    if interRRI_norRNA:
        df_inter_RRI = df_inter_RRI[(df_inter_RRI['ltype'].isin(['mRNA', 'lncRNA'])) & (df_inter_RRI['rtype'].isin(['mRNA', 'lncRNA']))]
    df_inter_RRI = df_inter_RRI[df_inter_RRI['support'] >= support_read]
    nx_inter_RRI = nx.from_pandas_dataframe(df_inter_RRI, 'lchr', 'rchr')
    nx_inter_RRI_info_dict, G_largest = RRI_network_property2(nx_inter_RRI)
    for i, j in nx_inter_RRI_info_dict.items():
        fn_stat_dict[i] = j
    # fn_stat_df['sampling'] = ''
    fn_stat_df = pd.DataFrame(fn_stat_dict, index=[0])
    return fn_stat_df
def main():
    # sample_ls = ['egg_cell1', 'cell1_cell4', 'cell4_cell64', 'cell64_sphere', 'sphere_shield']
    sample_ls = ['egg_cell1_egg_cell1', 'cell1_cell4_cell1_cell4', 'cell4_cell64_cell4_cell64', 'cell64_sphere_cell64_sphere', 'sphere_shield_sphere_shield']
    all_sample = generate_sample(sample_ls)
    all_sample_d = nested_dict(2, list)
    # save_dir = '/Share/home/zhangqf7/gongjing/zebrafish/result/dynamic_merge_region/005_005_new/abs/mergepeaks_d10'
    save_dir = '/Share/home/zhangqf7/gongjing/zebrafish/result/dynamic_merge_region/005_005_new/abs/new_mergepeaks_d10'
    for sample in all_sample:
        bed = '%s/%s' % (save_dir, sample)
        d = read_bed(bed)
        for i, j in d.items():
            for m, n in j.items():
                for tx_start_end in n:
                    all_sample_d[i][m].append(tx_start_end)
    print len(all_sample_d['egg_cell1/window-anno.bed']), len(all_sample_d['egg_cell1/window-anno.bed'][1]), len(all_sample_d['egg_cell1/window-anno.bed'][4])
    for i, j in all_sample_d.items():
        savefn = '%s/separate/%s.bed' % (save_dir, i.split('/')[0])
        with open(savefn, 'w') as SAVEFN:
            for m, n in j.items():
                for tx_start_end in n:
                    print >>SAVEFN, tx_start_end.replace('|', '\t')
    for way in range(1, len(sample_ls) + 1):
        way_ls = []
        for i, j in all_sample_d.items():
            for m, n in j.items():
                if m == way:
                    for tx_start_end in n:
                        way_ls.append(tx_start_end)
        savefn = '%s/separate/way%s.bed' % (save_dir, way)
        with open(savefn, 'w') as SAVEFN:
            for tx_start_end in set(way_ls):
                print >>SAVEFN, tx_start_end.replace('|', '\t')
def buildTree(fname, skipdash=False):
    aTree = nested_dict()
    hdl = open(fname, 'r')
    for i, line in enumerate(hdl):
        cols = line.strip().split('\t')
        cid = cols.pop(0)
        if skipdash:
            cols = filter(lambda x: x != '-', cols)
        else:
            cols = map(lambda x: 'null' if x == '-' else x, cols)
        if len(cols) == 0:
            continue
        branch = {'id': cid, 'depth': len(cols)}
        for c in reversed(cols):
            branch = {c: branch}
        aTree.update(nested_dict(branch))
    hdl.close()
    return aTree.to_dict(), i
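# Usage sketch for buildTree; the file name and rows below are invented for
# illustration. Each tab-separated row starts with an identifier followed by
# the labels along its path; '-' entries are dropped when skipdash=True.
with open('tree.tsv', 'w') as demo:
    demo.write('n1\tanimal\tcat\n')
    demo.write('n2\tplant\t-\toak\n')
tree, last_row_index = buildTree('tree.tsv', skipdash=True)
# tree -> {'animal': {'cat': {'id': 'n1', 'depth': 2}},
#          'plant':  {'oak': {'id': 'n2', 'depth': 2}}}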
def make_gff3(base_name, args, mapping_dict): """make gff3 output of blast hits for jbrowse display""" DMP_dict = nested_dict() for context in ['cg','chg','chh']: file = os.path.join(args.outputdir,'%s.%s.pickle' % (base_name, context)) with open(file) as input_handle: dict_entry = nested_dict(pickle.load(input_handle)) DMP_dict.update(dict_entry) gff3_dir = os.path.join(args.outputdir,'gff3') if not os.path.exists(gff3_dir): os.mkdir(gff3_dir) gff3_output = open(os.path.join(gff3_dir, base_name.replace('(','').replace(')','').replace('.','') + '.gff3'), 'w') gff3_output.write('##gff-version 3.2.1\n') for gene, subdict in DMP_dict.items(): contig = None for position,subdict in sorted(subdict.items()): if subdict['original_contig'] != contig: contig = subdict['original_contig'] concat_contig, concat_start_pos, concat_end_pos = mapping_dict[contig] gff3_output.write('##sequence-region %s %s %s\n' % (concat_contig, concat_start_pos, concat_end_pos)) out_line = [] out_line.append('%(concatenated_contig)s' % subdict) #1 seqid out_line.append('RnBeads_%(context)s' % subdict) #2 source out_line.append('5_methylcytosine') #3 type 5_methylcytosine see http://www.sequenceontology.org/browser/current_svn/term/SO:0001918 out_line.append(str(int('%(concat_contig_pos)s' % subdict))) #4 start out_line.append(str(int('%(concat_contig_pos)s' % subdict))) #5 end out_line.append('%(diffmeth.p.val)s' % subdict) #6 score out_line.append('%(Strand)s' % subdict) #6 strand out_line.append('0') #6 Phase . or 0 attributes = 'ID=%(context)s_%(combinedRank)s;' % subdict subdict['diffmeth.p.val'] = '%.2e' % float(subdict['diffmeth.p.val']) subdict['diffmeth.p.adj.fdr'] = '%.2e' % float(subdict['diffmeth.p.adj.fdr']) attributes += 'Name=%(mean.diff)s p-value:%(diffmeth.p.val)s str:(%(Strand)s);' % (subdict) #7 Attributes, start with unique ID attributes += 'Description=FDR-adjusted p-value:%(diffmeth.p.adj.fdr)s;' % subdict #7 Attributes, start with unique ID attributes += 'Alias=%(concatenated_contig)s_%(concat_contig_pos)s;' % subdict #7 Attributes, start with unique ID attributes += 'Ontology_term=SO:0001918;' % subdict #7 Attributes, start with unique ID for k,v in subdict.items(): k = k[0].upper() + k[1:] attributes += '%s=%s;' % (k,v) attributes = attributes[:-1] out_line.append(attributes) #6 Phase . or 0 gff3_output.write('\t'.join(out_line) + '\n') os.system('bgzip -f %s' % (os.path.join(gff3_dir, base_name.replace('(','').replace(')','').replace('.','') + '.gff3')) ) os.system('tabix -p gff %s.gz' % (os.path.join(gff3_dir, base_name.replace('(','').replace(')','').replace('.','') + '.gff3')) )
def update_leaf(initial_dict, update_dict):
    """Smart update of dictionaries, by leaf.

    TODO Not implemented yet. Returning simple dict update.

    IDEA: Make tuple dict where tuple is made of keys and value is the last
    node. It will update recursively, only when the leaf is found.

    For example:
        a = {'a':1, 'b':{'c':2, 'd':'3'}}
        b = {'a':2, 'b':{'c':3, 'e':'3'}}
        c = update_leafs(a,b)
        c = {'a':2, 'b':{'c':3,'e':'3','d':'3'}}
    """
    import nested_dict as nd
    nd_initial_dict = nd.nested_dict(initial_dict)
    nd_update_dict = nd.nested_dict(update_dict)
    nd_initial_dict.update(nd_update_dict)
    return nd_initial_dict.to_dict()
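# A possible leaf-wise merge along the lines of the docstring's IDEA, sketched
# with nested_dict's flat iteration. This is a hypothetical helper, not the
# current behaviour of update_leaf above.
import nested_dict as nd

def update_leaf_by_leaf(initial_dict, update_dict):
    merged = nd.nested_dict()
    # replay the initial leaves first, then overwrite/extend with the update's
    # leaves, so siblings like 'd' in the docstring example are preserved
    for source in (initial_dict, update_dict):
        for keys, value in nd.nested_dict(source).items_flat():
            node = merged
            for k in keys[:-1]:
                node = node[k]  # auto-vivifies intermediate levels
            node[keys[-1]] = value
    return merged.to_dict()

# a = {'a': 1, 'b': {'c': 2, 'd': '3'}}
# b = {'a': 2, 'b': {'c': 3, 'e': '3'}}
# update_leaf_by_leaf(a, b) -> {'a': 2, 'b': {'c': 3, 'd': '3', 'e': '3'}}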
def bed_coordinate_conversion(genome_coordinate_bed=None, trans_bed=None, genome_bed=None, write_wig=1):
    if genome_bed is None:
        genome_bed = trans_bed.replace('.bed', '.genome.bed')
    print "ref genome bed: %s" % (genome_coordinate_bed)
    print "input trans bed: %s" % (trans_bed)
    print "output genome bed: %s" % (genome_bed)
    Parser = ParseTransClass(genomeCoorBedFile=genome_coordinate_bed)
    GENOME_BED = open(genome_bed, 'w')
    convert_dict = nested_dict(2, list)  # chr:['start'],['end'],['score']
    with open(trans_bed, 'r') as TRANS_BED:
        for line in TRANS_BED:
            line = line.strip()
            if not line or line.startswith(('#', 'track')):
                continue
            arr = line.split('\t')
            trans_id = arr[0]
            start = int(arr[1]) + 1  # need 1-based
            end = int(arr[2])
            score = float(arr[4])
            # print trans_id, start
            convert_ls = Parser.transCoor2geneCoor(trans_id, start, end)  # output also 1-based
            """
            >>> Parser.transCoor2geneCoor("ENSDART00000168451", 1, 1)
            [['1', 18716, 18716, 'ENSDARG00000102097', 1, 1]]
            """
            """
            for i in convert_ls:
                arr[0] = i[0]
                arr[1] = i[1]
                arr[2] = i[2]
                arr[3] = i[3]
                print >>GENOME_BED, '\t'.join(map(str, arr))
            """
            arr[0] = convert_ls[0][0]
            arr[1] = convert_ls[0][1] - 1
            arr[2] = convert_ls[0][2]
            arr[3] = ','.join([i[3] for i in convert_ls])
            print >>GENOME_BED, '\t'.join(map(str, arr))
            convert_dict[arr[0]]['start'].append(arr[1])
            convert_dict[arr[0]]['end'].append(arr[2])
            convert_dict[arr[0]]['score'].append(arr[4])
    GENOME_BED.close()
    if write_wig:
        wig = trans_bed.replace('.bed', '.genome.wig')
        with open(wig, 'w') as WIG:
            print >>WIG, 'track type=wiggle_0'
            for i, j in convert_dict.items():
                print >>WIG, 'variableStep chrom=%s span=1' % (i)
                start_ls, score_ls = zip(*sorted(zip(j['start'], j['score'])))
                for start, score in zip(start_ls, score_ls):
                    print >>WIG, '\t'.join(map(str, [start + 1, score]))
def read_dir(dir='/Share/home/zhangqf7/gongjing/zebrafish/data/paris/shi-zp-5-rep-combine/downsampling_N', to_dgframe=0, get_inter_intra=1, read_nx=1, interRRI_norRNA=1, support_read=3): fn_ls = os.listdir(dir) # print fn_ls fn_stat_dict = nested_dict() downsampling_N_draw = dir + '.subnetwork.draw.pdf' fig,ax=plt.subplots(10,1) for n,fn in enumerate(fn_ls): print "process: %s"%(fn) dfFile = dir + '/' + fn + '/' + '27-DG' frameFile = dfFile + '.txt' if to_dgframe: paris_dg2frame.DG2Frame(dfFile=dfFile, frameFile=frameFile) if get_inter_intra: inter, intra = 0, 0 with open(frameFile, 'r') as TXT: for line in TXT: line = line.strip() if not line or line.startswith('#'): continue arr = line.split('\t') if arr[1] == arr[5]: intra += 1 else: inter += 1 fn_stat_dict[fn]['inter'] = inter fn_stat_dict[fn]['intra'] = intra fn_stat_dict[fn]['all'] = intra + inter if read_nx: df = pd.read_csv(frameFile, header=0, sep='\t') df['type'] = ['intra' if i == j else 'inter' for i,j in zip(df['lchr'], df['rchr'])] df_inter_RRI = df[df['type']=='inter'] nx_inter_RRI = nx.from_pandas_dataframe(df_inter_RRI, 'lchr', 'rchr') fn_stat_dict[fn]['uniq RRI'] = len(nx_inter_RRI.edges()) if interRRI_norRNA: df_inter_RRI = df_inter_RRI[(df_inter_RRI['ltype'].isin(['mRNA', 'lncRNA'])) & (df_inter_RRI['rtype'].isin(['mRNA', 'lncRNA']))] df_inter_RRI = df_inter_RRI[df_inter_RRI['support']>=support_read] nx_inter_RRI = nx.from_pandas_dataframe(df_inter_RRI, 'lchr', 'rchr') nx_inter_RRI_info_dict, G_largest = RRI_network_property2(nx_inter_RRI) for i,j in nx_inter_RRI_info_dict.items(): fn_stat_dict[fn][i] = j # fn_stat_dict[fn]['uniq RRI'] = len(nx_inter_RRI.edges()) if n < 10: draw_graph(G_largest, ax=ax[n]) plt.savefig(downsampling_N_draw) savefn = dir + '.stat.txt' fn_stat_df = pd.DataFrame.from_dict(fn_stat_dict) fn_stat_df = fn_stat_df.T fn_stat_df['sampling'] = fn_stat_df.index print fn_stat_df.head() fn_stat_df.to_csv(savefn, header=True, index=False, sep='\t') return fn_stat_df
def match_model(edit):
    '''Given an edit sequence, look up the likelihood'''
    modelmatch = nested_dict(pickle.load(open("resources/similarity_model.p", "rb")))
    keys = {'delete': 0.0, 'insert': 0.0, 'replace': 0.0}
    d = dict(Counter([e[0] for e in edit]))
    for k, v in d.iteritems():
        keys[k] = v
    p = modelmatch[keys['delete']][keys['insert']][keys['replace']]
    if isinstance(p, float):
        return p
    else:
        return 0
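# Illustration only: an edit sequence here is a list whose elements start with
# the operation name ('delete', 'insert' or 'replace'); the per-operation
# counts become the three nested lookup keys into the pickled model. The tuple
# contents below are hypothetical.
# edit = [('replace', 'a', 'b'), ('delete', 'c', ''), ('replace', 'd', 'e')]
# match_model(edit)  # looks up the likelihood for 1 delete, 0 inserts, 2 replaces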
def get_adwords_data(store, numdays):
    start, end = get_sliding_date_range(numdays)
    adwords_report = AdwordsReport(store, ["kwSite"], ["cost", "qualityScore"],
                                   ["add", "average"], None, time_aggregation=None)
    report = adwords_report.get_data(mode=None, length=None, start_date=start, end_date=end)
    spend_data = nested_dict()
    for entry in report["entries"]:
        kw = entry["dimensions"]["kwSite"]["value"].lower()
        spend_data[kw]['cost'] = entry["metrics"]["cost"]["value"]
        spend_data[kw]['quality'] = entry["metrics"]["qualityScore"]["value"]
    return spend_data
def get_groups(args):
    """get groups defined in sample file"""
    group_dict = nested_dict()
    with open(args.samples) as handle:
        header = handle.readline().rstrip('\n').split(',')
        for line in handle:
            split_line = line.rstrip('\n').split(',')
            sample = split_line[0]
            for name, item in zip(header[2:], split_line[2:]):
                try:
                    group_dict[name][item].append(sample)
                except AttributeError:
                    group_dict[name][item] = [sample]
    return group_dict
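# Sketch of the sample file get_groups expects (a hypothetical CSV; only the
# layout matters): column 1 is the sample name, the first two columns are
# skipped, and every column from the third onwards is a grouping factor.
#
# samples.csv:
#   sample,file,treatment,tissue
#   s1,s1.bam,control,leaf
#   s2,s2.bam,drought,leaf
#   s3,s3.bam,drought,root
#
# get_groups(args) then yields roughly:
#   {'treatment': {'control': ['s1'], 'drought': ['s2', 's3']},
#    'tissue':    {'leaf': ['s1', 's2'], 'root': ['s3']}}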
def all_gini(RPKM_combine=None, mode='gini', null_pct=1): if RPKM_combine is None: RPKM_combine = '/Share/home/zhangqf7/gongjing/zebrafish/result/RPKMCorrelationPairwiseNew/2018_03_12_gini/RPKM_combine.merge.txt' df = pd.read_csv(RPKM_combine, header=None, index_col=0, sep='\t') print df.head() trans_dict = loadTransGtfBed2('/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.trans.bed2') sample_ls = ['egg', '1cell', '4cell', '64cell', 'sphere', 'shield'] sample_path = ['/Share/home/zhangqf7/gongjing/zebrafish/data/icSHAPE_final_out_new_win/%s.icshape.w200.s30.T2.t200.out'%(i) for i in sample_ls] # norm by window sample_ic_dict = nested_dict() for i,j in zip(sample_path, sample_ls): print "read icshape: %s"%(i) sample_ic_dict[j] = readIc(i) t_cutoff = sample_path[0].split('.')[-2] savefn = RPKM_combine.replace('.txt', '.%s.%s.null%s.txt'%(t_cutoff, mode, int(null_pct*100))) SAVEFN = open(savefn, 'w') print >>SAVEFN, '\t'.join(['%s(transcript)\t%s(UTR5)\t%s(CDS)\t%s(UTR3)'%(i,i,i,i) for i in sample_ls]) for tx in df.index: gini_ls = [tx] for i in sample_ls: if sample_ic_dict[i].has_key(tx) and trans_dict.has_key(tx): utr_5_start, utr_5_end, cds_start, cds_end, utr_3_start, utr_3_end = [int(trans_dict[tx][g]) for g in ['utr_5_start', 'utr_5_end', 'cds_start', 'cds_end', 'utr_3_start', 'utr_3_end']] if utr_5_start == 0: utr_5_gini = 'NULL' else: utr_5_gini = gini(sample_ic_dict[i][tx][(utr_5_start-1):(utr_5_end)], mode=mode, null_pct=null_pct) if utr_5_gini < 0: utr_5_gini = 'NULL' if utr_3_start == 0: utr_3_gini = 'NULL' else: utr_3_gini = gini(sample_ic_dict[i][tx][(utr_3_start-1):(utr_3_end)], mode=mode, null_pct=null_pct) if utr_3_gini < 0: utr_3_gini = 'NULL' cds_gini = gini(sample_ic_dict[i][tx][(cds_start-1):(cds_end)], mode=mode, null_pct=null_pct) if cds_gini < 0: cds_gini = 'NULL' transcript_gini = gini(sample_ic_dict[i][tx][0:], mode=mode, null_pct=null_pct) if transcript_gini < 0: transcript_gini = 'NULL' sample_gini_ls = [transcript_gini, utr_5_gini, cds_gini, utr_3_gini] else: sample_gini_ls = ['NULL','NULL', 'NULL', 'NULL'] gini_ls += sample_gini_ls print >> SAVEFN, '\t'.join(map(str, gini_ls)) SAVEFN.close()
def loadTransGtfBed2(ref_bed='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/gtf/ref_GRCz10_top_level.trans.bed.2'):
    H = open(ref_bed)
    line = H.readline()
    trans_dict = nested_dict()
    header_ls = ['tx', 'gene', 'type', 'length', 'utr_5_start', 'utr_5_end',
                 'cds_start', 'cds_end', 'utr_3_start', 'utr_3_end']
    while line:
        if line.startswith('#'):
            line = H.readline(); continue
        arr = line.strip().split()
        for i, j in zip(header_ls, arr):
            trans_dict[arr[0]][i] = j
        line = H.readline()
    H.close()
    print "read: %s, n=%s" % (ref_bed, len(trans_dict))
    return trans_dict.to_dict()
def read_maternal(maternal_list_file=None):
    if maternal_list_file is None:
        maternal = '/Share/home/zhangqf7/gongjing/zebrafish/data/maternal_gene/maternal-decay.txt'
    else:
        maternal = maternal_list_file
    maternal_gene_dict = nested_dict(1, int)
    with open(maternal, 'r') as IN:
        for line in IN:
            line = line.strip()
            if not line or line.startswith('value'):
                continue
            arr = line.split('\t')
            maternal_gene_dict[arr[0]] += 1
    return maternal_gene_dict.to_dict()
def read_rpkm(rpkm_txt=None):
    if rpkm_txt is None:
        rpkm_txt = '/Share/home/zhangqf7/gongjing/zebrafish/data/RPKM/DMSO_1cell_rep1'
    rpkm_dict = nested_dict()
    with open(rpkm_txt, 'r') as TXT:
        for line in TXT:
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                header = line.replace('#', '').split('\t')
                continue
            arr = line.split('\t')
            for i, j in zip(header, arr):
                rpkm_dict[arr[0]][i] = j
    return rpkm_dict.to_dict()
def loadTransGtfBed2(ref_bed='/Share/home/zhangqf7/gongjing/mes/ref/mm10.transCoor.bed.2'):
    H = open(ref_bed)
    line = H.readline()
    trans_dict = nested_dict()
    header_ls = ['tx', 'gene', 'type', 'length', 'utr_5_start', 'utr_5_end',
                 'cds_start', 'cds_end', 'utr_3_start', 'utr_3_end']
    while line:
        if line.startswith('#'):
            line = H.readline(); continue
        arr = line.strip().split('\t')
        gene = arr[1].split('=')[0].split()[0]
        for i, j in zip(header_ls, arr):
            trans_dict[arr[0]][i] = j
        line = H.readline()
    H.close()
    print "read: %s, n=%s" % (ref_bed, len(trans_dict))
    return trans_dict.to_dict()
def read_bed(bed):
    print "load: %s" % (bed)
    d = nested_dict(2, list)
    with open(bed, 'r') as BED:
        for line in BED:
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            tx_start_end = '|'.join(arr[1:4])
            sample_ls = arr[6].split('|')
            for sample in sample_ls:
                d[sample][len(sample_ls)].append(tx_start_end)
    d = d.to_dict()
    for sample in sample_ls:
        print sample, len(d[sample][len(sample_ls)]), len(sample_ls)
    return d
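# Illustration of the window-anno bed line read_bed expects (the line below is
# invented): columns 2-4 hold the transcript window and column 7 lists the
# supporting samples joined by '|'.
#
#   chr1<TAB>ENSDART0001<TAB>100<TAB>200<TAB>.<TAB>+<TAB>egg_cell1|cell1_cell4
#
# For that line the window 'ENSDART0001|100|200' is recorded under each listed
# sample, keyed by the number of supporting samples (here 2):
#   d['egg_cell1'][2]   -> ['ENSDART0001|100|200']
#   d['cell1_cell4'][2] -> ['ENSDART0001|100|200']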
def file_info(file_dir=None, result_dir=None):
    if file_dir is None:
        file_dir = '/Share/home/zhangqf7/gongjing/zebrafish/data/RT'
    if result_dir is None:
        result_dir = '/Share/home/zhangqf7/gongjing/zebrafish/result/RTCorrelationPairwise'
    files = os.listdir(file_dir)
    NAI_files = [i for i in files if i.startswith('NAI')]
    DMSO_files = [i for i in files if i.startswith('DMSO')]
    paths = [file_dir + '/' + i for i in files]
    file_info_dict = nested_dict()
    file_info_dict['file_dir'] = file_dir
    file_info_dict['files'] = files
    file_info_dict['paths'] = paths
    file_info_dict['result_dir'] = result_dir
    return file_info_dict.to_dict()
def pickle_gff3_entry(in_file, out_file, args, mapping_dict):
    """pickle gff3 file entry with DMPs for merging"""
    DMP_dict = nested_dict()
    with open(in_file, 'r') as in_handle:
        header = in_handle.readline()[:-1].split('\t')
        context = in_file.split('.')[-2]
        for line in in_handle:
            split_line = line[:-1].split('\t')
            content = {'context': context}
            for k, v in zip(header, split_line):
                if ',' in v:
                    v = float(v.replace(',', '.'))
                content[k] = v
            try:
                if content['diffmeth.p.val'] <= float(args.treshold):
                    DMP_dict[content['concatenated_contig']][int(content['concat_contig_pos'])] = content
            except ValueError:
                continue
    with open(out_file, 'wb') as out_handle:
        pickle.dump(DMP_dict.to_dict(), out_handle, 2)
def wur_buxton_diff(header, split_line):
    """calculates average difference in meth between WUR and buxton"""
    meth_values = nested_dict()
    for k, v in zip(header, split_line):
        try:
            location, sample, type = k.split('_')
            if sample in groups:
                meth_values[groups[sample]][location][sample][type] = v
        except ValueError:
            pass
    for group in meth_values.keys():
        buxton_meth_values = []
        for ind, meth_dict in meth_values[group]['BUXTON'].items():
            try:
                buxton_meth_values.append(int(meth_dict['methylated']) / float(meth_dict['total']))
            except TypeError:
                buxton_meth_values.append(None)
            except ValueError:
                buxton_meth_values.append(None)
        wur_meth_values = []
        for ind, meth_dict in meth_values[group]['WUR'].items():
            try:
                wur_meth_values.append(int(meth_dict['methylated']) / float(meth_dict['total']))
            except TypeError:
                wur_meth_values.append(None)
            except ValueError:
                wur_meth_values.append(None)
        diff = [a - b for a, b in zip(buxton_meth_values, wur_meth_values) if a and b]
        if diff == []:
            abs_diff = None
            rel_diff = None
        else:
            abs_diff = sum([abs(v) for v in diff]) / float(len(diff))
            rel_diff = sum(diff) / float(len(diff))
        meth_values[group]['abs_diff'] = abs_diff
        meth_values[group]['rel_diff'] = rel_diff
    return meth_values
def json2VerticaTable(tableName, jsonData, cur, keep_keys, extend_cols, extend_vals):
    """insert chosen {key,value} pairs of data into Vertica for each batch"""
    i = 0
    for r in jsonData:
        # since >=2 level, the 2nd level's data will be a problem
        columns = []
        values = []
        nd = nested_dict(r)
        for keys_as_tuple, value in nd.items_flat():
            con_keys = "_".join(str(x) for x in keys_as_tuple)
            if con_keys in keep_keys:
                columns.append(con_keys)
                values.append(str(value).replace("'", "''"))
        columns.extend(extend_cols)
        values.extend(extend_vals)
        # print("**************this is the columns******************")
        # print columns
        # print("**************this is the values******************")
        # print values
        if i == 0:
            # CREATE TABLE foo (numbs int, names varchar(30))
            createTableSQL = "CREATE TABLE IF NOT EXISTS " + tableName + " ("
            for k in columns:
                createTableSQL = createTableSQL + k + " varchar(200),"
            # remove the last , and add )
            createTableSQL = createTableSQL[:-1] + ")"
            # print(createTableSQL)
            cur.execute(createTableSQL)
            i = i + 1
        else:
            columnsStr = ','.join(columns)
            # print(columnsStr)
            # TypeError: sequence item 6: expected string or Unicode, bool found
            valuesStr = ','.join(["'" + str(x) + "'" for x in values])
            # print(valuesStr)
            insertSQL = "INSERT INTO %s (%s) VALUES (%s)" % (tableName, columnsStr, valuesStr)
            # print(insertSQL)
            cur.execute(insertSQL)
            i = i + 1
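# Small illustration of the key flattening json2VerticaTable relies on; the
# record and keys below are invented. Nested JSON keys are joined with '_' so
# keep_keys can select them by their flattened column name.
from nested_dict import nested_dict

record = {"user": {"id": 42, "name": "alice"}, "event": "click"}
for keys_as_tuple, value in nested_dict(record).items_flat():
    print("%s = %s" % ("_".join(str(x) for x in keys_as_tuple), value))
# prints (in no particular order):
#   user_id = 42
#   user_name = alice
#   event = click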
def get_stats(args):
    """get stats based on bam file"""
    mapping_dict = nested_dict.nested_dict()
    try:
        handle = pysam.AlignmentFile(args.input, 'rb')
    except OSError:
        print 'error'
    # Samples can be added from several lanes, which will result in different read groups.
    # In order to only account for samples here, make a dict mapping RG_ID to sample.
    RG_to_sample = dict([(r['ID'], r['SM']) for r in handle.header['RG']])
    count = 0
    for read in handle:
        count += 1
        if not count % 1000000:
            print '%s reads processed' % count
        if not read.is_duplicate and not read.is_qcfail:
            # make dict of read tag objects
            tag_dict = dict(read.tags)
            sample = RG_to_sample[tag_dict['RG']]
            # add count of valid read to total for this sample
            try:
                mapping_dict['total'][sample] += 1
            except TypeError:
                mapping_dict['total'][sample] = 1
            if 'mono' in sample:
                if sample.replace(' ', '_') not in read.reference_name:
                    try:
                        if read.reference_name not in mapping_dict['discard']:
                            mapping_dict['discard'].append(read.reference_name)
                    except AttributeError:
                        mapping_dict['discard'] = [read.reference_name]
                    except KeyError:
                        mapping_dict['discard'] = [read.reference_name]
            try:
                mapping_dict[read.reference_name][sample] += 1
            except TypeError:
                mapping_dict[read.reference_name][sample] = 1
    return mapping_dict
def test_default(self):
    """
    test a range of nested_dict
    """
    from nested_dict import nested_dict
    nd = nested_dict()
    nd['new jersey']['mercer county']['plumbers'] = 3
    nd['new jersey']['mercer county']['programmers'] = 81
    nd['new jersey']['middlesex county']['programmers'] = 81
    nd['new jersey']['middlesex county']['salesmen'] = 62
    nd['new york']['queens county']['plumbers'] = 9
    nd['new york']['queens county']['salesmen'] = 36
    expected_result = sorted([(('new jersey', 'mercer county', 'plumbers'), 3),
                              (('new jersey', 'mercer county', 'programmers'), 81),
                              (('new jersey', 'middlesex county', 'programmers'), 81),
                              (('new jersey', 'middlesex county', 'salesmen'), 62),
                              (('new york', 'queens county', 'plumbers'), 9),
                              (('new york', 'queens county', 'salesmen'), 36)])
    all = sorted(tup for tup in nd.iteritems_flat())
    self.assertEqual(all, expected_result)
    all = sorted(tup for tup in nd.items_flat())
    self.assertEqual(all, expected_result)
def classify_using_naive_bays(titles, dict_titles_testing, dict_priors, words_dict): title_score_dict = nd.nested_dict() # counter_right = 0 # counter_wrong = 0 for title in titles: score_dict = {} words = regex_filtering(title.lower()) if dict_priors['story'] == 0: score_story = float('-inf') else: score_story = math.log10(dict_priors['story']) if dict_priors['ask_hn'] == 0: score_ask_hn = float('-inf') else: score_ask_hn = math.log10(dict_priors['ask_hn']) if dict_priors['show_hn'] == 0: score_show_hn = float('-inf') else: score_show_hn = math.log10(dict_priors['show_hn']) if dict_priors['poll'] == 0: score_poll = float('-inf') else: score_poll = math.log10(dict_priors['poll']) for word in words: if word in words_dict.keys( ): # should we do something if word is not in model or just skip it? score_story += math.log10(words_dict[word]['P(w|story)']) score_ask_hn += math.log10(words_dict[word]['P(w|ask_hn)']) score_show_hn += math.log10(words_dict[word]['P(w|show_hn)']) score_poll += math.log10(words_dict[word]['P(w|poll)']) score_dict['story'] = score_story score_dict['ask_hn'] = score_ask_hn score_dict['show_hn'] = score_show_hn score_dict['poll'] = score_poll max_value = max(score_dict.values()) # maximum value max_keys = [k for k, v in score_dict.items() if v == max_value ] # getting all keys containing the `maximum` estimated_post_type = max_keys[0] title_score_dict[title]['estimation'] = estimated_post_type if estimated_post_type == dict_titles_testing[title]: title_score_dict[title][ 'original_post_type'] = dict_titles_testing[title] + ' right' # counter_right += 1 else: title_score_dict[title][ 'original_post_type'] = dict_titles_testing[title] + ' wrong' # counter_wrong += 1 title_score_dict[title]['score_story'] = score_story title_score_dict[title]['score_ask_hn'] = score_ask_hn title_score_dict[title]['score_show_hn'] = score_show_hn title_score_dict[title]['score_poll'] = score_poll # print(counter_right) # print(counter_wrong) return title_score_dict
def create_synaptic_input(self, **keywords): """ Calculate synaptic input of populations and areas using the spike data. Uses function ah.pop_synaptic_input. If the synaptic inputs have previously been stored with the same parameters, they are loaded from file. Parameters ---------- t_min : float, optional Minimal time in ms of the simulation to take into account for the calculation. Defaults to 500 ms. t_max : float, optional Maximal time in ms of the simulation to take into account for the calculation. Defaults to the simulation time. areas : list, optional Which areas to include in the calculcation. Defaults to all loaded areas. pops : list or {'complete'}, optional Which populations to include in the calculation. If set to 'complete', all populations the respective areas are included. Defaults to 'complete'. kernel : {'gauss_time_window', 'alpha_time_window', 'rect_time_window'}, optional Convolution kernel for the calculation of the underlying firing rates. Defaults to 'binned' which corresponds to a simple histogram. resolution: float, optional Width of the convolution kernel. Specifically it correponds to: - 'binned' : bin width of the histogram - 'gauss_time_window' : sigma - 'alpha_time_window' : time constant of the alpha function - 'rect_time_window' : width of the moving rectangular function """ default_dict = { 'areas': self.areas_loaded, 'pops': 'complete', 'resolution': 1., 'kernel': 'binned' } params = ah._create_parameter_dict(default_dict, self.T, **keywords) # Check if synaptic inputs have been stored with the same parameters iterator_areas = ah.model_iter(mode='single', areas=params['areas'], pops=None) iterator_pops = ah.model_iter(mode='single', areas=params['areas'], pops=params['pops']) fp = os.path.join(self.output_dir, 'synaptic_input') self.synaptic_input = ah._check_stored_data(fp, copy(iterator_areas), params) fp = os.path.join(self.output_dir, 'synaptic_input_pops') self.synaptic_input_pops = ah._check_stored_data( fp, copy(iterator_pops), params) if self.synaptic_input is None: print('Computing rate time series') if 'rate_time_series' not in inspect.getmembers(self): self.create_rate_time_series(**params) d_pops = nested_dict() d_pops['Parameters'] = params for area, pop in copy(iterator_pops): if pop in self.network.structure[area]: if 'I' in pop: tau_syn = self.network.params['neuron_params'][ 'single_neuron_dict']['tau_syn_in'] else: tau_syn = self.network.params['neuron_params'][ 'single_neuron_dict']['tau_syn_ex'] time_series = ah.synaptic_output( self.rate_time_series_pops[area][pop], tau_syn, params['t_min'], params['t_max'], resolution=params['resolution']) d_pops[area][pop] = time_series self.synaptic_output_pops = d_pops.to_dict() d_pops = nested_dict() d_pops['Parameters'] = params d_pops['Parameters'] = params for area, pop in iterator_pops: if pop in self.network.structure[area]: time_series = np.zeros( int((params['t_max'] - params['t_min']) / params['resolution'])) for source_area, source_pop in ah.model_iter( mode='single', areas=self.areas_loaded): if source_pop in self.network.structure[source_area]: weight = self.network.W[area][pop][source_area][ source_pop] time_series += ( self.synaptic_output_pops[source_area] [source_pop] * abs(weight) * self.network.K[area][pop][source_area] [source_pop]) d_pops[area][pop] = time_series d = nested_dict() d['Parameters'] = params for area in params['areas']: d[area] = np.zeros( int((params['t_max'] - params['t_min']) / params['resolution'])) for pop in self.network.structure[area]: d[area] += 
d_pops[area][pop] * self.network.N[area][pop] d[area] /= self.network.N[area]['total'] self.synaptic_input = d.to_dict() self.synaptic_input_pops = d_pops.to_dict()
def add_events(self, company_id, events, worker, allow_locked_tasks=False) -> Tuple[int, int, dict]: actions = [] task_ids = set() task_iteration = defaultdict(lambda: 0) task_last_scalar_events = nested_dict( 3, dict) # task_id -> metric_hash -> variant_hash -> MetricEvent task_last_events = nested_dict( 3, dict) # task_id -> metric_hash -> event_type -> MetricEvent errors_per_type = defaultdict(int) valid_tasks = self._get_valid_tasks( company_id, task_ids={ event["task"] for event in events if event.get("task") is not None }, allow_locked_tasks=allow_locked_tasks, ) for event in events: # remove spaces from event type event_type = event.get("type") if event_type is None: errors_per_type["Event must have a 'type' field"] += 1 continue event_type = event_type.replace(" ", "_") if event_type not in EVENT_TYPES: errors_per_type[f"Invalid event type {event_type}"] += 1 continue task_id = event.get("task") if task_id is None: errors_per_type["Event must have a 'task' field"] += 1 continue if task_id not in valid_tasks: errors_per_type["Invalid task id"] += 1 continue event["type"] = event_type # @timestamp indicates the time the event is written, not when it happened event["@timestamp"] = es_factory.get_es_timestamp_str() # for backward bomba-tavili-tea if "ts" in event: event["timestamp"] = event.pop("ts") # set timestamp and worker if not sent if "timestamp" not in event: event["timestamp"] = es_factory.get_timestamp_millis() if "worker" not in event: event["worker"] = worker # force iter to be a long int iter = event.get("iter") if iter is not None: iter = int(iter) event["iter"] = iter # used to have "values" to indicate array. no need anymore if "values" in event: event["value"] = event["values"] del event["values"] event["metric"] = event.get("metric") or "" event["variant"] = event.get("variant") or "" index_name = get_index_name(company_id, event_type) es_action = { "_op_type": "index", # overwrite if exists with same ID "_index": index_name, "_source": event, } # for "log" events, don't assing custom _id - whatever is sent, is written (not overwritten) if event_type != EventType.task_log.value: es_action["_id"] = self._get_event_id(event) else: es_action["_id"] = dbutils.id() task_ids.add(task_id) if (iter is not None and event.get("metric") not in self._skip_iteration_for_metric): task_iteration[task_id] = max(iter, task_iteration[task_id]) self._update_last_metric_events_for_task( last_events=task_last_events[task_id], event=event, ) if event_type == EventType.metrics_scalar.value: self._update_last_scalar_events_for_task( last_events=task_last_scalar_events[task_id], event=event) actions.append(es_action) action: Dict[dict] plot_actions = [ action["_source"] for action in actions if action["_source"]["type"] == EventType.metrics_plot.value ] if plot_actions: self.validate_and_compress_plots( plot_actions, validate_json=config.get("services.events.validate_plot_str", False), compression_threshold=config.get( "services.events.plot_compression_threshold", 100_000), ) added = 0 if actions: chunk_size = 500 with translate_errors_context(), TimingContext( "es", "events_add_batch"): # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed with closing( helpers.streaming_bulk( self.es, actions, chunk_size=chunk_size, # thread_count=8, refresh=True, )) as it: for success, info in it: if success: added += 1 else: errors_per_type[ "Error when indexing events batch"] += 1 remaining_tasks = set() now = datetime.utcnow() for task_id in task_ids: # Update 
related tasks. For reasons of performance, we prefer to update # all of them and not only those who's events were successful updated = self._update_task( company_id=company_id, task_id=task_id, now=now, iter_max=task_iteration.get(task_id), last_scalar_events=task_last_scalar_events.get( task_id), last_events=task_last_events.get(task_id), ) if not updated: remaining_tasks.add(task_id) continue if remaining_tasks: TaskBLL.set_last_update(remaining_tasks, company_id, last_update=now) if not added: raise errors.bad_request.EventsNotAdded(**errors_per_type) errors_count = sum(errors_per_type.values()) return added, errors_count, errors_per_type
def flatten_params(params):
    return OrderedDict(('.'.join(k), Variable(v, requires_grad=True))
                       for k, v in nested_dict(params).iteritems_flat()
                       if v is not None)
def search_user(request): if request.method=='POST': userid = request.POST.get('userid'); userrole = request.POST.get('userrole') #return render_to_response('search_result.html', {'userid':userid, 'userrole':userrole}) return render(request,'search_result.html', {'userid':userid, 'userrole':userrole}) elif request.method=='GET': if 'userid' in request.GET and request.GET['userid']: userid= request.GET['userid'] userrole=request.GET['userrole'] lifecycle_userrole = userrole.split('-') lifecycle = lifecycle_userrole[1] usertype = lifecycle_userrole[2] #userrole = str(lifecycle_userrole[1]).title() if lifecycle == "all": user_status = nested_dict() get_usrlist_frm_cpo_dev1 = USER_TABLE.objects.using('eon_rtp3_1_l').filter(username=userid).order_by('userid') for user in get_usrlist_frm_cpo_dev1: user_status['eon-rtp3-1-l']= user.subtypeofuser user_status['eon-rch1-1-l']= user.subtypeofuser get_usrlist_frm_cpo_dev2 = USER_TABLE.objects.using('eon_rtp3_2_l').filter(username=userid).order_by('userid') for user in get_usrlist_frm_cpo_dev2: user_status['eon-rtp3-2-l']= user.subtypeofuser user_status['eon-rtp5-1-l']= user.subtypeofuser cpo_dev_onramp_resource = {1:'cpo-dev-superuser',2:'cpo-dev-sysadmin',3:'cpo-dev-networkadmin',4:'cpo-dev-operator',5:'cpo-dev-provisioner',} for subtypeoduser,onramp_resource in cpo_dev_onramp_resource.iteritems(): #onramp_provisioned_users = get_cpo_provisioned_userlist(onramp_resource) onramp_provisioned_users = get_local_cpo_provisioned_userlist(request,userrole) logger.info("All user found under onramp resource : ") logger.debug(onramp_provisioned_users) #onramp_approved_usrlist=[] if onramp_provisioned_users != None: if userid in onramp_provisioned_users: logger.info("user found") user_status['onramp_dev']=subtypeoduser cpo_prod_onramp_resource = {1:'cpo-prod-superuser',2:'cpo-prod-sysadmin',3:'cpo-prod-networkadmin',4:'cpo-prod-operator',5:'cpo-prod-provisioner',} for subtypeoduser,onramp_resource in cpo_prod_onramp_resource.iteritems(): #onramp_provisioned_users = get_cpo_provisioned_userlist(onramp_resource) onramp_provisioned_users = get_local_cpo_provisioned_userlist(request,userrole) logger.info("All user found under onramp PROD resource : ") logger.debug(onramp_provisioned_users) #onramp_approved_usrlist=[] if onramp_provisioned_users != None: if userid in onramp_provisioned_users: user_status['onramp_prod']=subtypeoduser #Due to django template bug , defaultdict or nesteddict must be converted to python dict before passing to view user_status = dict(user_status) return render_to_response('search_result.html', {'userid':userid, 'userrole':userrole, 'lifecycle':lifecycle, 'user_status': sorted(user_status.items())},) #return render_to_response('search_result.html', {'userid':userid, 'userrole':userrole, 'lifecycle':lifecycle, # 'user_status': user_status},) elif lifecycle == 'dev': #-------------------------------------------------------------------------------------------------------- cpo_dev_onramp_resource = {'cpo-dev-superuser':1,'cpo-dev-sysadmin':2,'cpo-dev-networkadmin':3,'cpo-dev-operator':4,'cpo-dev-provisioner':5,} user_status = {} subtype_of_user= cpo_dev_onramp_resource[userrole] #get_usrlist_frm_cpo_dev1 = USER_TABLE.objects.using('eon_rtp3_1_l').filter(username=userid).filter(subtypeofuser=1).order_by('userid') get_usrlist_frm_cpo_dev1 = USER_TABLE.objects.using('eon_rtp3_1_l').filter(username=userid,subtypeofuser=subtype_of_user).order_by('userid') logger.info("**********Fetched data from dev1") logger.debug(get_usrlist_frm_cpo_dev1) for 
user in get_usrlist_frm_cpo_dev1: user_status['eon-rtp3-1-l']= user.subtypeofuser #get_usrlist_frm_cpo_dev2 = USER_TABLE.objects.using('eon_rtp3_2_l').filter(username=userid).filter(subtypeofuser=1).order_by('userid') get_usrlist_frm_cpo_dev2 = USER_TABLE.objects.using('eon_rtp3_2_l').filter(username=userid,subtypeofuser=subtype_of_user).order_by('userid') for user in get_usrlist_frm_cpo_dev2: user_status['eon-rtp3-2-l']= user.subtypeofuser #cpo_dev_onramp_resource = {1:'cpo-dev-superuser',2:'cpo-dev-sysadmin',3:'cpo-dev-netwrokadmin',4:'cpo-dev-operator',5:'cpo-dev-provisioner',} #onramp_provisioned_users = get_cpo_provisioned_userlist(userrole) onramp_provisioned_users = get_local_cpo_provisioned_userlist(request,userrole) logger.info("All user found under onramp resource : ") logger.debug(onramp_provisioned_users) if onramp_provisioned_users != None: if userid in onramp_provisioned_users: logger.info("user found") user_status['onramp_dev']=subtype_of_user #Due to django template bug , defaultdict or nesteddict must be converted to python dict before passing to view #user_status = dict(user_status) #-------------------------------------------------------------------------------------------------------- return render_to_response('search_result.html',{'userid':userid, 'userrole':userrole,'lifecycle':lifecycle,'user_status':sorted(user_status.items())}) elif lifecycle == 'prod': #-------------------------------------------------------------------------------------------------------- cpo_prod_onramp_resource = {'cpo-prod-superuser':1,'cpo-prod-sysadmin':2,'cpo-prod-networkadmin':3,'cpo-prod-operator':4,'cpo-prod-provisioner':5,} user_status = {} subtype_of_user= cpo_prod_onramp_resource[userrole] #get_usrlist_frm_cpo_dev1 = USER_TABLE.objects.using('eon_rtp3_1_l').filter(username=userid).filter(subtypeofuser=1).order_by('userid') get_usrlist_frm_cpo_prod1 = USER_TABLE.objects.using('eon_rtp3_1_l').filter(username=userid,subtypeofuser=subtype_of_user).order_by('userid') logger.info("**********Fetched data from dev1") logger.debug(get_usrlist_frm_cpo_prod1) for user in get_usrlist_frm_cpo_prod1: user_status['eon-rch1-1-l']= user.subtypeofuser get_usrlist_frm_cpo_prod2 = USER_TABLE.objects.using('eon_rtp3_2_l').filter(username=userid,subtypeofuser=subtype_of_user).order_by('userid') for user in get_usrlist_frm_cpo_prod2: user_status['eon-rtp5-1-l']= user.subtypeofuser #cpo_dev_onramp_resource = {1:'cpo-dev-superuser',2:'cpo-dev-sysadmin',3:'cpo-dev-netwrokadmin',4:'cpo-dev-operator',5:'cpo-dev-provisioner',} #onramp_provisioned_users = get_cpo_provisioned_userlist(userrole) onramp_provisioned_users = get_local_cpo_provisioned_userlist(request,userrole) logger.info("All user found under onramp resource : ") logger.debug(onramp_provisioned_users) if onramp_provisioned_users != None: if userid in onramp_provisioned_users: logger.info("user found") user_status['onramp_prod']=subtype_of_user #Due to django template bug , defaultdict or nesteddict must be converted to python dict before passing to view #user_status = dict(user_status) #-------------------------------------------------------------------------------------------------------- return render_to_response('search_result.html',{'userid':userid, 'userrole':userrole,'lifecycle':lifecycle,'user_status':sorted(user_status.items())}) else: return render_to_response('search_result.html',{'userid':userid, 'userrole':userrole,'lifecycle':lifecycle,'user_status':'Hello'}) else: error_message = 'Please provide valid userid' return 
render_to_response('search_result.html', {'error_msg':error_message})
def flatten_params(params):
    flat_params = OrderedDict()
    for keys, v in nested_dict(params).iteritems_flat():
        if v is not None:
            flat_params['.'.join(keys)] = Variable(v, requires_grad=True)
    return flat_params
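# Usage sketch for flatten_params, assuming the legacy torch.autograd.Variable
# API this helper targets (the function above also needs OrderedDict and
# nested_dict in scope); the parameter layout below is invented.
import torch

params = {
    'conv0': {'weight': torch.randn(16, 3, 3, 3), 'bias': torch.zeros(16)},
    'fc': {'weight': torch.randn(10, 16)},
}
flat = flatten_params(params)
print(sorted(flat.keys()))  # ['conv0.bias', 'conv0.weight', 'fc.weight']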
def plot_dir_loss(d, not_plot_dir_str, only_plot_dir_str, min_loss_col, savefn): fn_ls = os.listdir(d) for j in not_plot_dir_str.split(':'): fn_ls = [i for i in fn_ls if not j in i] if only_plot_dir_str != '.': fn_ls = [i for i in fn_ls if i in only_plot_dir_str.split(':')] fn_ls = [i for i in fn_ls if i != 'readme.txt'] loss_dict = nested_dict() loss_ls = [] for i in fn_ls: log = d + '/' + i + '/log.txt' train_shell = d + '/' + i + '/train.sh' model_parameter_dict = {} with open(train_shell, 'r') as TRAIN: for line in TRAIN: line = line.strip('\n') line = line.replace('RNA-structure-profile-imputation', 'ShapeImputation') if 'CUDA_VISIBLE_DEVICES' in line: continue elif '--' not in line: # SAVEFN.write(line+'\n') pass else: arr = line.strip(' ').split(' ') arr[1] = '' if len(arr) == 2 else arr[1] model_parameter_dict[arr[0]] = arr[1] # print(arr) # print(model_parameter_dict) if '--batch_size' not in model_parameter_dict: model_parameter_dict['--batch_size'] = 100 if '--test_batch_size' not in model_parameter_dict: model_parameter_dict['--test_batch_size'] = 100 if os.path.isfile(log): print('process: {}'.format(i)) log_plot_savefn = log.replace('.txt', '.plot.pdf') loss_df = util.read_log(log, savefn=log_plot_savefn, test_batch_size=int(model_parameter_dict['--test_batch_size'])) if loss_df.shape[0] < 10: continue loss_min = loss_df.loc[loss_df[min_loss_col].idxmin()] loss_min.loc['epoch',] = loss_min.name loss_min.name = i # print(loss_min) loss_ls.append(loss_min) loss_df_all = pd.concat(loss_ls, axis=1) loss_df_all = loss_df_all[sorted(loss_df_all.columns)] print(loss_df_all) savefn_txt = savefn.replace('.pdf', '.csv') loss_df_all_T = loss_df_all.T loss_df_all_T['epoch'].dtype == 'int' loss_df_all_T.to_csv(savefn_txt, header=True, index=True, sep='\t', float_format='%.5f') fig,ax=plt.subplots(figsize=(max(8,0.5*len(fn_ls)),28)) cols = ['validate loss (train_nonull_validate_nonull)', 'validate loss (train_hasnull_validate_hasnull)', 'validate loss (train_hasnull_validate_onlynull)', 'validate loss (train_hasnull_validate_nonull)'] for col in cols: ax.plot(loss_df_all.loc[col,], label=col, marker='.') for i in range(0, loss_df_all.shape[1]): plt.axvline(x=i, ymin=0, ymax=1, ls='--', lw='0.2', color='grey') plt.xticks(range(0, len(loss_df_all.columns)), loss_df_all.columns, rotation=90) plt.legend() plt.tight_layout() plt.savefig(savefn) plt.close()
def dest_reveal_new(G, adversary, delay, amount, pre, next): T = nd.nested_dict() flag1 = True anon_sets = nd.nested_dict() level = 0 index = 0 T[0]["nodes"] = [next] T[0]["delays"] = [delay] print(delay) T[0]["previous"] = [-1] T[0]["visited"] = [[pre, adversary, next]] T[0]["amounts"] = [amount] # pr = pf.edge_prob(G.edges[pre,adversary]["LastFailure"])*pf.edge_prob(G.edges[adversary,next]["LastFailure"]) # T[0]["probs"] = [pr] x = -1 # if T[0]["delays"][0] == 0: # maybe_targets[index]["target"] = next # maybe_targets[index]["path"] = [adversary,next] # maybe_targets[index]["delay"] = delay # maybe_targets[index]["amt"] = amount # maybe_targets[index]["tech"] = 0 # maybe_targets[index]["sources"] = source_reveal(G, [pre, adversary,next], 0, 0, amount, pre, next,adversary) # index += 1 # paths = pf.Dijkstra_all_paths(G,next,amount,pf.lnd_cost_fun) # for u in paths: # if pre in paths[u]: # ind = paths[u].index(pre) # if(paths[u][ind:] == [pre,adversary,next]): # anon_sets[index] = [u,next] # print("match",u,next) paths = nd.nested_dict() num_paths = 0 flag = True while (flag): level += 1 if (level == 3): flag1 = False break t1 = T[level - 1]["nodes"] d1 = T[level - 1]["delays"] p1 = T[level - 1]["previous"] v1 = T[level - 1]["visited"] a1 = T[level - 1]["amounts"] # pr1 = T[level - 1]["probs"] t2 = [] d2 = [] p2 = [] v2 = [[]] a2 = [] # pr2 = [] for i in range(0, len(t1)): u = t1[i] # if v1[i] == []: # print(u) # print("yes",u) for [u, v] in G.out_edges(u): #print(v) # p = p1[i] # flag1 = 0 # level2 = level - 2 # while(level2>=1): # if(T[level2]["nodes"][p] == v): # flag1 = 1 # break # else: # p = T[level2]["previous"][p] # level2 = level2 - 1 # pr = pf.edge_prob(G.edges[u,v]["LastFailure"])*pr1[i] if (v != pre and v != adversary and v != next and v not in v1[i] and (G.edges[u, v]["Balance"] + G.edges[v, u]["Balance"]) >= ((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))): t2.append(v) d2.append(d1[i] - G.edges[u, v]["Delay"]) p2.append(i) v2.append(v1[i] + [v]) a2.append(((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))) # pr2.append(pr) T[level]["nodes"] = t2 #print(level,t2,d2) T[level]["delays"] = d2 T[level]["previous"] = p2 T[level]["visited"] = v2 T[level]["amounts"] = a2 # T[level]["probs"] = pr2 #print(t2,d2,p2) print(level, len(t2)) if (len(t2) == 0): flag = False level = level - 1 while (level >= 0): t = T[level]["nodes"] d = T[level]["delays"] p = T[level]["previous"] a = T[level]["amounts"] v = T[level]["visited"] #print(level) # if(level == 0): # print(t,d) for i in range(0, len(t)): #if(d[i] == 0): path = [] level1 = level path.append(T[level1]["nodes"][i]) loc = T[level1]["previous"][i] while (level1 > 0): level1 = level1 - 1 path.append(T[level1]["nodes"][loc]) loc = T[level1]["previous"][loc] path.reverse() path = [pre, adversary] + path if (len(path) == len(set(path))): #print(path, level) amt = a[i] pot = path[len(path) - 1] sources = deanonymize(G, pot, path, amt, pf.lnd_cost_fun) if sources != None: #print("match",pot) anon_sets[pot] = list(sources) # if paths == [pre, adversary] + path: # maybe_targets[index]["target"] = pot # maybe_targets[index]["path"] = [adversary] + path # maybe_targets[index]["delay"] = delay # maybe_targets[index]["amt"] = amt # maybe_targets[index]["tech"] = 0 # maybe_targets[index]["sources"] = source_reveal(G, [pre, adversary] + path, 0, 0, amt, pre, next, # adversary) # index += 1 # for u in paths: # if pre in paths[u]: # ind = paths[u].index(pre) # if paths[u][ind:] == [pre,adversary] + path: # 
anon_sets[index] = [u,pot] # index+=1 # print("match",u,pot) level = level - 1 return anon_sets, flag1
def deanonymize(G, target, path, amt, cost_function): # if(target == 500): # print("target", 6946,path) pq = PriorityQueue() delays = {} costs = {} paths = nd.nested_dict() paths1 = nd.nested_dict() dists = {} visited = set() previous = {} done = {} prob = {} sources = [] pre = path[0] adv = path[1] nxt = path[2] for node in G.nodes(): previous[node] = -1 delays[node] = -1 costs[node] = max paths[node] = [] dists[node] = max done[node] = 0 paths1[node] = [] prob[node] = 1 dists[target] = 0 paths[target] = [target] costs[target] = amt delays[target] = 0 pq.put((dists[target], target)) flag1 = 0 flag2 = 0 while (0 != pq.qsize()): curr_cost, curr = pq.get() if curr_cost > dists[curr]: continue visited.add(curr) if target == 500 and (curr == pre): print(curr) for [v, curr] in G.in_edges(curr): # if v == pre and curr == adv: # print("yes1", pre,curr,G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"],costs[curr]) if (G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"] >= costs[curr]) and v not in visited: # if v==pre: # print("yes",pre) if done[v] == 0: paths1[v] = [v] + paths[curr] done[v] = 1 cost = dists[curr] + cost_function(G, costs[curr], curr, v) if cost < dists[v]: paths[v] = [v] + paths[curr] # if v==pre: # print(v,paths[v]) dists[v] = cost delays[v] = delays[curr] + G.edges[v, curr]["Delay"] costs[v] = costs[curr] + G.edges[v, curr][ "BaseFee"] + costs[curr] * G.edges[v, curr]["FeeRate"] # prob[v] = pf.edge_prob(G.edges[v,curr]["LastFailure"])*prob[curr] pq.put((dists[v], v)) if (curr in path[1:]): ind = path.index(curr) if (paths[curr] != path[ind:]): return None if curr == adv: #print("ad", paths[curr]) flag1 = 1 if (curr == pre): # print(pre,paths[pre]) if paths[pre] != path: return [pre] else: sources.append(pre) flag2 = 1 if flag1 == 1 and flag2 == 1: if pre in paths[curr]: for [v, curr] in G.in_edges(curr): if v not in paths[curr]: sources.append(v) sources = set(sources) return sources
def __init__(self, cache_cfg_file):
    self.cache_str = ""
    self.cache_type = ""
    self.cache_cfg_file = cache_cfg_file
    self.section_dic = nested_dict()
def deanonymize_c(G, target, path, amt, fuzz): pq = PriorityQueue() cost_function = pf.c_cost_fun(fuzz) delays = {} costs = {} paths = nd.nested_dict() paths1 = nd.nested_dict() dists = {} visited = set() previous = {} done = {} # prob = {} sources = [] pre = path[0] adv = path[1] nxt = path[2] for node in G.nodes(): previous[node] = -1 delays[node] = -1 costs[node] = max paths[node] = [] dists[node] = max done[node] = 0 paths1[node] = [] # prob[node] = 1 dists[target] = 0 paths[target] = [target] costs[target] = amt delays[target] = 0 pq.put((dists[target], target)) flag1 = 0 flag2 = 0 while (0 != pq.qsize()): curr_cost, curr = pq.get() if curr_cost > dists[curr]: continue visited.add(curr) for [v, curr] in G.in_edges(curr): if (G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"] >= costs[curr]) and v not in visited: if done[v] == 0 and G.nodes[v]["Tech"] == 1: paths1[v] = [v] + paths[curr] done[v] = 1 cost = dists[curr] + cost_function(G, costs[curr], curr, v) if cost < dists[v]: paths[v] = [v] + paths[curr] dists[v] = cost delays[v] = delays[curr] + G.edges[v, curr]["Delay"] costs[v] = costs[curr] + G.edges[v, curr][ "BaseFee"] + costs[curr] * G.edges[v, curr]["FeeRate"] # prob[v] = pf.edge_prob(G.edges[v,curr]["LastFailure"])*prob[curr] pq.put((dists[v], v)) # If at any point the sub-path found is not found to be optimal, this is definetely not the destination if using lnd since the sub-path from an intermediary to # the destination has to be the cheapest path from the intermediary to the destination. if (curr in path[1:]): ind = path.index(curr) if (paths[curr] != path[ind:]): return [] if curr == adv: flag1 = 1 if (curr == pre): # If pre is the source, the path from pre need to not match the path found since, the cost from the source to the second node is computed differently. # Moreover, the source would not choose the absolute cheapest path since the first hop may not have sufficient forward balance. # Thus, pre has to be the source if the paths dont match, since the paths would only match if pre is an intermediary. if paths[pre] != path: return [pre] else: # if the paths do match, pre is just one possible source sources.append(pre) flag2 = 1 if flag1 == 1 and flag2 == 1: # since if pre is in the path from curr, the path from pre has to match the path we had found as it is the cheapest path from pre. This measns that curr # is a valid second node. So, all neighbors of curr that have not occured in the path are potential sources. if pre in paths[curr]: for [v, curr] in G.in_edges(curr): if v not in paths[curr] and G.nodes[v]["Tech"] == 1: sources.append(v) sources = list(set(sources)) return sources
def flatten_stats(stats):
    flat_stats = OrderedDict()
    for keys, v in nested_dict(stats).iteritems_flat():
        flat_stats['.'.join(keys)] = v
    return flat_stats
def main(file_name, stop_words, model_output, baseline_output): # Read in the survey CSV df = get_dataframe_from_csv(file_name) # Create training data df2018 = get_training_data(df) # Create classes list classes = ['story', 'ask_hn', 'show_hn', 'poll'] # Dictionary that will hold all necessary data related to word # such as : frequency and conditional probability for each class words_dict = nd.nested_dict() # Training data for classes for class_name in classes: df2018_class_name = df2018[df2018['Post Type'] == class_name] list_of_story_title = df_title_to_list(df2018_class_name) words_dict = title_to_vocab(list_of_story_title, class_name, words_dict) # For experiment 1.3.1 and 1.3.3 # Remove list of predefined words from vocabulary for word in stop_words: words_dict.pop(word, None) # For experiment 1.3.2 if model_output == "wordlength-model.txt": words_dict_iterator = dict( words_dict ) # recreating another copy of the dict for iteration because size changes for word in words_dict_iterator: if len(word) <= 2 or len(word) >= 9: words_dict.pop(word, None) # Fill missing values for specific class in dictionary for each word frequency with 0 # To prevent calculation errors later for class_name in classes: fill_non_existing_values(words_dict, class_name) # Convert dict of frequencies to pandas dataframe to simplify later calculations df_word_frequencies = convert_dict_to_dataframe(words_dict) # generate smoothed conditinal probabilities for each word in class for class_name in classes: words_dict = get_conditional_probability(words_dict, df_word_frequencies, class_name) # generate model output file if len(model_output) > 0: generate_model_file(model_output, words_dict) # PART 2 # Create testing data df2019 = get_testing_data(df) # Find all duplicate rows (titles) # get_duplicated_titles(df2019) # Getting priors of testing set dict_priors = generate_priors(df2018) # Transforming column of pandas dataframe such as key: title, value: post Type # del dict dict_titles_testing = dict(zip(df2019['Title'], df2019['Post Type'])) # Initializing dictionary that will hold score of each title for each class with prediction title_score_dict = nd.nested_dict() # Getting list of titles list_of_title = df_title_to_list(df2019) # Classification using Naive Bays Classification : title_score_dict = classify_using_naive_bays(list_of_title, dict_titles_testing, dict_priors, words_dict) # Generating Baseline File if len(baseline_output) > 0: generate_baseline_file(baseline_output, title_score_dict) # Return prediction and estimation list to calculate performance if len(baseline_output) == 0: result = get_y_true_y_prediction(title_score_dict) return result if baseline_output == "baseline-result.txt": result = get_y_true_y_prediction(title_score_dict) result.append(len(words_dict)) generate_vocab_file(words_dict) generate_remove_word_file() return result
import iothub_client import serial import json import time import datetime import sys from iothub_client import * from nested_dict import nested_dict message_timeout = 10000 receive_context = 0 IoTMessage = '' counter = 0 workingSecond = datetime.datetime.now().second sensorDataDict = nested_dict(2, int) protocol = IoTHubTransportProvider.AMQP # example connection_string = "HostName=myhub.azure-devices.net;DeviceId=mydevice;SharedAccessKey=mykey=" connection_string = "{ Insert string here }" def receive_message_callback(message, counter): global weHavePicture, IoTMessage buffer = message.get_bytearray() size = len(buffer) message = buffer[:size].decode('utf-8') print("Message received: %s" % message) try: IoTMessage = json.loads(message) except:
def rep_compare(rep1_out=None, rep1_validate=None, rep1_predict=None, rep2_out=None, rep2_validate=None, rep2_predict=None, tx_null_pct=0.3, savefn=None): if rep1_out is None: rep1_out = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep1/shape.c200T2M0m0.out' if rep1_validate is None: rep1_validate = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep1/shape.c200T2M0m0.out.windowsHasNull/windowLen100.sliding100.validation.txt' if rep1_predict is None: rep1_predict = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep1/shape.c200T2M0m0.out.windowsHasNull/windowLen100.sliding100.validation.prediction_trainHasNull_lossAll.txt' if rep2_out is None: rep2_out = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep2/shape.c200T2M0m0.out' if rep2_validate is None: rep2_validate = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep2/shape.c200T2M0m0.out.windowsHasNull/windowLen100.sliding100.validation.txt' if rep2_predict is None: rep2_predict = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape_rep2/shape.c200T2M0m0.out.windowsHasNull/windowLen100.sliding100.validation.prediction_trainHasNull_lossAll.txt' if savefn is None: savefn = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo/3.shape/shape.c200T2M0m0.out.windowsHasNull/shape_dist/rep.corr.txt' cols = [ 'tx', 'length', 'start', 'end', 'mean_reactivity', 'null_pct', 'seq', 'fragment_shape', 'fragment_shape(true)' ] df_rep1_validate = pd.read_csv(rep1_validate, header=None, sep='\t') df_rep1_validate.columns = cols df_rep1_predict = pd.read_csv(rep1_predict, header=None, sep='\t') df_rep1_validate['fragment_shape(predict)'] = df_rep1_predict[0] df_rep2_validate = pd.read_csv(rep2_validate, header=None, sep='\t') df_rep2_validate.columns = cols df_rep2_predict = pd.read_csv(rep2_predict, header=None, sep='\t') df_rep2_validate['fragment_shape(predict)'] = df_rep2_predict[0] df_rep = df_rep1_validate.merge(df_rep2_validate, how='inner', on=['tx', 'start', 'end']) print("df shape: rep1,rep2,rep", df_rep1_validate.shape, df_rep2_validate.shape, df_rep.shape) out_dict1 = util.read_icshape_out(rep1_out) out_dict2 = util.read_icshape_out(rep2_out) valid_tx = [] for i in df_rep['tx']: rep1_null_pct = out_dict1[i]['reactivity_ls'].count('NULL') / ( float(out_dict1[i]['length']) - 35) rep2_null_pct = out_dict2[i]['reactivity_ls'].count('NULL') / ( float(out_dict2[i]['length']) - 35) if rep1_null_pct > tx_null_pct: continue if rep2_null_pct > tx_null_pct: continue valid_tx.append(i) print("tx valid number", len(set(valid_tx))) df_rep = df_rep[df_rep['tx'].isin(valid_tx)] df_rep.to_csv(savefn, header=True, index=False, sep='\t') tx_shape_dict = nested_dict(2, list) for tx in set(valid_tx): df_tx = df_rep[df_rep['tx'] == tx] for index, i in df_tx.iterrows(): for v in i['fragment_shape(true)_x'].split(','): tx_shape_dict[tx]['rep1_before'].append(float(v)) for v in i['fragment_shape(true)_y'].split(','): tx_shape_dict[tx]['rep2_before'].append(float(v)) for v in i['fragment_shape(predict)_x'].split(','): tx_shape_dict[tx]['rep1_after'].append(float(v)) for v in i['fragment_shape(predict)_y'].split(','): tx_shape_dict[tx]['rep2_after'].append(float(v)) corr_dict = nested_dict(2, int) for tx in tx_shape_dict: v1 = [ i for i, j in zip(tx_shape_dict[tx]['rep1_before'], tx_shape_dict[tx]['rep2_before']) if i >= 0 and j >= 0 ] v2 = [ j for i, j in zip(tx_shape_dict[tx]['rep1_before'], tx_shape_dict[tx]['rep2_before']) if i >= 0 and j >= 0 ] v3 = [ i 
for i, j in zip(tx_shape_dict[tx]['rep1_after'], tx_shape_dict[tx]['rep2_after']) if i >= 0 and j >= 0 ] v4 = [ j for i, j in zip(tx_shape_dict[tx]['rep1_after'], tx_shape_dict[tx]['rep2_after']) if i >= 0 and j >= 0 ] # print(tx,v1,v2,v3,v4) if len(v1) <= 10 or len(v3) <= 10: continue c1, p1 = stats.pearsonr(v1, v2) c2, p2 = stats.pearsonr(v3, v4) corr_dict[tx]['corr_before'] = c1 corr_dict[tx]['corr_before(p)'] = p1 corr_dict[tx]['corr_before(n)'] = len(v1) corr_dict[tx]['corr_after'] = c2 corr_dict[tx]['corr_after(p)'] = p2 corr_dict[tx]['corr_after(n)'] = len(v3) corr_df = pd.DataFrame.from_dict(corr_dict, orient='index') # print(corr_df) corr_df.to_csv(savefn.replace('.pdf', '.txt'), header=True, index=True, sep='\t') corr_df['# imputated nt'] = corr_df['corr_after(n)'] - corr_df[ 'corr_before(n)'] fig, ax = plt.subplots(figsize=(8, 8)) sns.scatterplot(x='corr_before', y='corr_after', data=corr_df, ax=ax, hue='# imputated nt') ax.set_xlim(0.2, 1.05) ax.set_ylim(0.2, 1.05) plt.xticks([0.2, 0.4, 0.6, 0.8, 1.0], [0.2, 0.4, 0.6, 0.8, 1.0]) plt.yticks([0.2, 0.4, 0.6, 0.8, 1.0], [0.2, 0.4, 0.6, 0.8, 1.0]) plt.tight_layout() plt.savefig(savefn) plt.close()
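# A minimal sketch of the per-transcript correlation step above: keep only positions where
# both replicates carry a valid (>= 0) reactivity, then compute the Pearson correlation with
# scipy.stats.pearsonr, as the function does. The values below are illustrative; the real
# code additionally skips transcripts with 10 or fewer valid pairs.
from scipy import stats

rep1 = [0.10, -999, 0.50, 0.70, 0.00, 0.90]   # -999 standing in for a masked/NULL position
rep2 = [0.20, 0.30, -999, 0.60, 0.10, 0.80]
pairs = [(i, j) for i, j in zip(rep1, rep2) if i >= 0 and j >= 0]
v1 = [i for i, j in pairs]
v2 = [j for i, j in pairs]
corr, pval = stats.pearsonr(v1, v2)
print(corr, pval)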
def flatten_stats(stats): return OrderedDict( ('.'.join(k), v) for k, v in nested_dict(stats).iteritems_flat())
def matrix_to_dict(m, area_list, structure, external=None): """ Convert a matrix containing connectivity information of a network defined by structure to a dictionary. Parameters ---------- m : array-like Matrix to be converted. area_list: list List of areas in the network. Defines the order of areas in the matrix to be created. structure : dict Structure of the network. Define the populations for each single area. external: numpy.ndarray or dict If None, do not include connectivity from external sources in the return dictionary. If numpy.ndarray or dict, use the connectivity given to add an entry 'external' for each population. Defaults to None. """ dic = nested_dict() for area, area2 in product(area_list, area_list): mask = create_mask(structure, target_areas=[area], source_areas=[area2], external=False) if external is not None: x = m[mask[:, :-1]] else: x = m[mask] if area == 'TH' and area2 == 'TH': x = x.reshape((6, 6)) x = np.insert(x, 2, np.zeros((2, 6), dtype=float), axis=0) x = np.insert(x, 2, np.zeros((2, 8), dtype=float), axis=1) elif area2 == 'TH': x = x.reshape((8, 6)) x = np.insert(x, 2, np.zeros((2, 8), dtype=float), axis=1) elif area == 'TH': x = x.reshape((6, 8)) x = np.insert(x, 2, np.zeros((2, 8), dtype=float), axis=0) else: x = x.reshape((8, 8)) for i, pop in enumerate(population_list): for j, pop2 in enumerate(population_list): if x[i][j] < 1e-20: x[i][j] = 0. dic[area][pop][area2][pop2] = x[i][j] if external is not None: if isinstance(external, np.ndarray): for area in dic: for pop in population_list: if pop in structure[area]: mask = create_vector_mask(structure, areas=[area], pops=[pop]) dic[area][pop]['external'] = { 'external': external[mask][0] } else: dic[area][pop]['external'] = {'external': 0.} if isinstance(external, dict): for area in dic: for pop in dic[area]: dic[area][pop]['external'] = external[area][pop] return dic.to_dict()
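# A hedged sketch of the return structure of matrix_to_dict: a plain nested dict keyed as
# dic[target_area][target_pop][source_area][source_pop] -> connection value. The area and
# population names below are illustrative and not taken from area_list or population_list.
from nested_dict import nested_dict

dic = nested_dict()
dic['V1']['23E']['V2']['23E'] = 0.042   # all four levels auto-create on assignment
conn = dic.to_dict()                    # convert back to an ordinary dict, as the function does
print(conn['V1']['23E']['V2']['23E'])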
def deanonymize_c(G, target, path, amt, fuzz): pq = PriorityQueue() cost_function = pf.c_cost_fun(fuzz) delays = {} costs = {} paths = nd.nested_dict() paths1 = nd.nested_dict() dists = {} visited = set() previous = {} done = {} # prob = {} sources = [] pre = path[0] adv = path[1] nxt = path[2] for node in G.nodes(): previous[node] = -1 delays[node] = -1 costs[node] = float("inf") paths[node] = [] dists[node] = float("inf") done[node] = 0 paths1[node] = [] # prob[node] = 1 dists[target] = 0 paths[target] = [target] costs[target] = amt delays[target] = 0 pq.put((dists[target], target)) flag1 = 0 flag2 = 0 while (0 != pq.qsize()): curr_cost, curr = pq.get() if curr_cost > dists[curr]: continue visited.add(curr) for [v, curr] in G.in_edges(curr): if (G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"] >= costs[curr]) and v not in visited: if done[v] == 0 and G.nodes[v]["Tech"] == 1: paths1[v] = [v] + paths[curr] done[v] = 1 cost = dists[curr] + cost_function(G, costs[curr], curr, v) if cost < dists[v]: paths[v] = [v] + paths[curr] dists[v] = cost delays[v] = delays[curr] + G.edges[v, curr]["Delay"] costs[v] = costs[curr] + G.edges[v, curr]["BaseFee"] + costs[curr] * G.edges[v, curr]["FeeRate"] # prob[v] = pf.edge_prob(G.edges[v,curr]["LastFailure"])*prob[curr] pq.put((dists[v], v)) if (curr in path[1:]): ind = path.index(curr) if (paths[curr] != path[ind:]): return [] if curr == adv: flag1 = 1 # if flag1 == 1: # print("path", paths[adv]) if (curr == pre): if paths[pre] != path: if G.nodes[pre]["Tech"] != 1: return [] return [pre] else: if G.nodes[pre]["Tech"] == 1: sources.append(pre) flag2 = 1 if flag1 == 1 and flag2 == 1: if pre in paths[curr]: for [v, curr] in G.in_edges(curr): if v not in paths[curr] and G.nodes[v]["Tech"] == 1: sources.append(v) sources = list(set(sources)) return sources
def load_degree_data(fn): """ Load connectivity information from json file and store indegrees in dictionary. Parameters ---------- fn : string File name of json file. The file has to contain a dictionary with a subdictionary called 'synapses' containing the synapses between any pair of populations at the top level. Returns ------- indegrees : dict Indegrees on population level. Dictionary levels are sorted as target area --> target population --> source area --> source population. indegrees_areas : dict Indegrees on area level. Dictionary levels are sorted as target area --> source area outdegrees : dict Outdegrees on population level. Dictionary levels are sorted as target area --> target population --> source area --> source population. outdegrees : dict Outdegrees on area level. Dictionary levels are sorted as target area --> source area """ f = open(fn) dat = json.load(f) f.close() syn = dat['synapses'] num = dat['neuron_numbers'] indegrees = nested_dict() outdegrees = nested_dict() for target_area, target_pop, source_area, source_pop in product( complete_area_list, population_list, complete_area_list, population_list): numT = num[target_area][target_pop] if numT > 0.0: indegrees[target_area][target_pop][source_area][source_pop] = syn[ target_area][target_pop][source_area][source_pop] / numT else: # assign 0 to indegrees onto non-existing populations indegrees[target_area][target_pop][source_area][source_pop] = 0.0 if source_area != 'external': numS = num[source_area][source_pop] if numS > 0.0: outdegrees[target_area][target_pop][source_area][ source_pop] = syn[target_area][target_pop][source_area][ source_pop] / numS else: # assign 0 to outdegrees from non-existing populations outdegrees[target_area][target_pop][source_area][ source_pop] = 0.0 for target_area, target_pop, ext_pop in product(complete_area_list, population_list, ['external']): numT = num[target_area][target_pop] if numT > 0.0: indegrees[target_area][target_pop]['external'][ext_pop] = syn[ target_area][target_pop]['external'][ext_pop] / numT else: indegrees[target_area][target_pop]['external'][ext_pop] = 0.0 indegrees_areas = area_level_dict(indegrees, num, degree='indegree') outdegrees_areas = area_level_dict(outdegrees, num, degree='outdegree') return (indegrees.to_dict(), indegrees_areas, outdegrees.to_dict(), outdegrees_areas)
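# A worked example of the degree computation above (numbers are made up): the indegree onto a
# target population is its incoming synapse count divided by the number of neurons in that
# target population; the outdegree divides the same synapse count by the source population size.
synapses_between = 1.2e6       # synapses from (source_area, source_pop) to (target_area, target_pop)
num_target_neurons = 20000.0
num_source_neurons = 15000.0
indegree = synapses_between / num_target_neurons if num_target_neurons > 0 else 0.0
outdegree = synapses_between / num_source_neurons if num_source_neurons > 0 else 0.0
print(indegree, outdegree)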
def coll_adv_attack(self, G, adversary, delay, amount, pre, next, advpath): # tp1_begin = time.time() T = nd.nested_dict() anon_sets = {} flag1 = True level = 0 T[0]["nodes"] = [next] T[0]["delays"] = [delay] T[0]["previous"] = [-1] T[0]["visited"] = [[pre,adversary,next]] T[0]["amounts"] = [amount] flag = True # cache sources for all candidate dovetail nodes (they are identical) sourcesets = {} while(flag): level+=1 if(level == 4): flag1 = False break t1 = T[level - 1]["nodes"] d1 = T[level - 1]["delays"] v1 = T[level - 1]["visited"] a1 = T[level - 1]["amounts"] t2 = [] d2 = [] p2 = [] v2 = [[]] a2 = [] for i in range(0,len(t1)): u = t1[i] for [u,v] in G.out_edges(u): if(v!=pre and v!=adversary and v!=next and v not in v1[i] and (d1[i] - G.edges[u,v]["Delay"])>=0 and (G.edges[u,v]["Balance"]+G.edges[v,u]["Balance"])>=((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))): t2.append(v) d2.append(d1[i] - G.edges[u,v]["Delay"]) p2.append(i) v2.append(v1[i]+[v]) a2.append(((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))) T[level]["nodes"] = t2 T[level]["delays"] = d2 T[level]["previous"] = p2 T[level]["visited"] = v2 T[level]["amounts"] = a2 if(len(t2) == 0): flag = False level = level - 1 while(level>=0): t = T[level]["nodes"] d = T[level]["delays"] p = T[level]["previous"] a = T[level]["amounts"] v = T[level]["visited"] for i in range(0, len(t)): if d[i] == 0: path = [] level1 = level path.append(T[level1]["nodes"][i]) loc = T[level1]["previous"][i] while (level1 > 0): level1 = level1 - 1 path.append(T[level1]["nodes"][loc]) loc = T[level1]["previous"][loc] path.reverse() path = [pre,adversary]+path if (len(path) == len(set(path))): amt = a[i] dl = d[i] pot = path[len(path) - 1] # find earliest possible dovetail node for this path dove, dove_index = self.get_dovetail(G, advpath, path, amt) if dove_index != -1: if dove not in sourcesets: fullpath = advpath + path[1:] ind = len(fullpath) - 1 while ind > dove_index: amt += G.edges[fullpath[ind-1], fullpath[ind]]["BaseFee"] + amt * G.edges[fullpath[ind-1], fullpath[ind]]["FeeRate"] dl += G.edges[fullpath[ind-1], fullpath[ind]]["Delay"] ind -= 1 # tp2_begin = time.time() sources = self.deanonymize(G,dove,fullpath[:dove_index+1],amt,dl) # tp2_end = time.time() # print("Time for candidate {}: {} seconds".format(pot, tp2_end - tp2_begin)) sourcesets[dove] = sources else: # print("Source from cache of dovetail {}.".format(dove)) sources = sourcesets[dove] if sources != None and len(sources) > 0: anon_sets[pot] = list(sources) # else: # print("Dovetail not found for candidate {}".format(pot)) level = level - 1 # tp1_end = time.time() # print("Time for full: {} seconds".format( tp1_end - tp1_begin)) return anon_sets, flag1
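# A small sketch of the amount bookkeeping in the level expansion above: when a path is extended
# one hop forward, the amount the next node receives is obtained by inverting the forwarding-fee
# relation amount_in = amount_out + BaseFee + amount_out * FeeRate. The fee values are illustrative.
def amount_after_hop(amount_in, base_fee, fee_rate):
    return (amount_in - base_fee) / (1 + fee_rate)

amount_out = amount_after_hop(1001.1, 1.0, 0.0001)
# sanity check: re-applying the forwarding fee recovers (approximately) the incoming amount
assert abs(amount_out + 1.0 + amount_out * 0.0001 - 1001.1) < 1e-9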
def add_events(self, company_id, events, worker): actions = [] task_ids = set() task_iteration = defaultdict(lambda: 0) task_last_events = nested_dict( 3, dict) # task_id -> metric_hash -> variant_hash -> MetricEvent for event in events: # remove spaces from event type if "type" not in event: raise errors.BadRequest("Event must have a 'type' field", event=event) event_type = event["type"].replace(" ", "_") if event_type not in EVENT_TYPES: raise errors.BadRequest( "Invalid event type {}".format(event_type), event=event, types=EVENT_TYPES, ) event["type"] = event_type # @timestamp indicates the time the event is written, not when it happened event["@timestamp"] = es_factory.get_es_timestamp_str() # for backward bomba-tavili-tea if "ts" in event: event["timestamp"] = event.pop("ts") # set timestamp and worker if not sent if "timestamp" not in event: event["timestamp"] = es_factory.get_timestamp_millis() if "worker" not in event: event["worker"] = worker # force iter to be a long int iter = event.get("iter") if iter is not None: iter = int(iter) event["iter"] = iter # used to have "values" to indicate array. no need anymore if "values" in event: event["value"] = event["values"] del event["values"] index_name = EventMetrics.get_index_name(company_id, event_type) es_action = { "_op_type": "index", # overwrite if exists with same ID "_index": index_name, "_type": "event", "_source": event, } # for "log" events, don't assing custom _id - whatever is sent, is written (not overwritten) if event_type != "log": es_action["_id"] = self._get_event_id(event) else: es_action["_id"] = dbutils.id() task_id = event.get("task") if task_id is not None: es_action["_routing"] = task_id task_ids.add(task_id) if iter is not None: task_iteration[task_id] = max(iter, task_iteration[task_id]) if event_type == EventType.metrics_scalar.value: self._update_last_metric_event_for_task( task_last_events=task_last_events, task_id=task_id, event=event) else: es_action["_routing"] = task_id actions.append(es_action) if task_ids: # verify task_ids with translate_errors_context(), TimingContext( "mongo", "task_by_ids"): res = Task.objects(id__in=task_ids, company=company_id).only("id") if len(res) < len(task_ids): invalid_task_ids = tuple( set(task_ids) - set(r.id for r in res)) raise errors.bad_request.InvalidTaskId( company=company_id, ids=invalid_task_ids) errors_in_bulk = [] added = 0 chunk_size = 500 with translate_errors_context(), TimingContext("es", "events_add_batch"): # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed with closing( helpers.streaming_bulk( self.es, actions, chunk_size=chunk_size, # thread_count=8, refresh=True, )) as it: for success, info in it: if success: added += chunk_size else: errors_in_bulk.append(info) remaining_tasks = set() now = datetime.utcnow() for task_id in task_ids: # Update related tasks. For reasons of performance, we prefer to update all of them and not only those # who's events were successful updated = self._update_task( company_id=company_id, task_id=task_id, now=now, iter=task_iteration.get(task_id), last_events=task_last_events.get(task_id), ) if not updated: remaining_tasks.add(task_id) continue if remaining_tasks: TaskBLL.set_last_update(remaining_tasks, company_id, last_update=now) # Compensate for always adding chunk_size on success (last chunk is probably smaller) added = min(added, len(actions)) return added, errors_in_bulk
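# A hedged sketch of the task_last_events bookkeeping above: nested_dict(3, dict) gives a
# three-level mapping (task_id -> metric_hash -> variant_hash) whose leaves default to a plain
# dict, so the latest scalar event per variant can be stored without explicit key checks.
# The identifiers and values below are illustrative.
from nested_dict import nested_dict

task_last_events = nested_dict(3, dict)
task_last_events['task-1']['metric-hash']['variant-hash'] = {
    'timestamp': 1584000000000,
    'iter': 42,
    'value': 0.73,
}
print(task_last_events['task-1']['metric-hash']['variant-hash']['iter'])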
def deanonymize(self, G,target,path,amt,dl): pq = PriorityQueue() delays = {} costs = {} paths = nd.nested_dict() paths1 = nd.nested_dict() dists = {} visited = set() previous = {} done = {} prob = {} sources = [] pre = path[0] adv = path[1] # nxt = path[2] for node in G.nodes(): previous[node] = -1 delays[node] = -1 costs[node] = inf paths[node] = [] dists[node] = inf done[node] = 0 paths1[node] = [] prob[node] = 1 dists[target] = 0 paths[target] = [target] costs[target] = amt delays[target] = dl pq.put((dists[target],target)) flag1 = 0 flag2 = 0 while(0!=pq.qsize()): curr_cost,curr = pq.get() if curr_cost > dists[curr]: continue visited.add(curr) for [v,curr] in G.in_edges(curr): if (G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"] >= costs[curr]) and v not in visited: if done[v] == 0: paths1[v] = [v]+paths[curr] done[v] = 1 cost = dists[curr] + self.cost_function(G,costs[curr],curr,v) if cost < dists[v]: paths[v] = [v]+paths[curr] dists[v] = cost delays[v] = delays[curr] + G.edges[v,curr]["Delay"] costs[v] = costs[curr] + G.edges[v, curr]["BaseFee"] + costs[curr] * G.edges[v, curr]["FeeRate"] pq.put((dists[v],v)) if(curr in path[1:]): ind = path.index(curr) if(paths[curr]!=path[ind:]): return None if curr == adv: flag1 = 1 if(curr == pre): if paths[pre] != path: return [pre] else: sources.append(pre) flag2 = 1 if flag1 == 1 and flag2 == 1: if pre in paths[curr]: for [v,curr] in G.in_edges(curr): if v not in paths[curr]: sources.append(v) sources = set(sources) return sources
def create_synchrony(self, **keywords): """ Calculate synchrony as the coefficient of variation of the population rate and store in member synchrony. Uses helper function synchrony. If the synchrony has previously been stored with the same parameters, they are loaded from file. Parameters ---------- t_min : float, optional Minimal time in ms of the simulation to take into account for the calculation. Defaults to 500 ms. t_max : float, optional Maximal time in ms of the simulation to take into account for the calculation. Defaults to the simulation time. areas : list, optional Which areas to include in the calculcation. Defaults to all loaded areas. pops : list or {'complete'}, optional Which populations to include in the calculation. If set to 'complete', all populations the respective areas are included. Defaults to 'complete'. resolution : float, optional Resolution of the population rate. Defaults to 1 ms. """ default_dict = { 'areas': self.areas_loaded, 'pops': 'complete', 'resolution': 1.0 } params = ah._create_parameter_dict(default_dict, self.T, **keywords) iterator = ah.model_iter(mode='single', areas=params['areas'], pops=params['pops']) # Check if synchrony values have been stored with the same parameters self.synchrony = ah._check_stored_data( os.path.join(self.output_dir, 'synchrony.json'), copy(iterator), params) if self.synchrony is None: print("Computing synchrony") d = nested_dict() d['Parameters'] = params for area, pop in iterator: if pop in self.network.structure[area]: d[area][pop] = ah.synchrony( self.spike_data[area][pop], self.network.N[area][pop], params['t_min'], params['t_max'], resolution=params['resolution']) else: d[area][pop] = np.nan for area in params['areas']: total_spikes = ah.area_spike_train(self.spike_data[area]) d[area]['total'] = ah.synchrony( total_spikes, self.network.N[area]['total'], params['t_min'], params['t_max'], resolution=params['resolution']) self.synchrony = d.to_dict()
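# A minimal sketch of the synchrony measure referenced above: the coefficient of variation of the
# binned population rate. The helper ah.synchrony is assumed to compute roughly this on real spike
# data; the rate trace below is made up.
import numpy as np

population_rate = np.array([12.0, 15.0, 9.0, 30.0, 11.0, 14.0])  # binned rate in spikes/s
cv = np.std(population_rate) / np.mean(population_rate)
print(cv)   # larger values indicate stronger population-wide fluctuations, i.e. more synchrony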
def flatten_stats(stats): return OrderedDict(('.'.join(k), v) for k, v in nested_dict(stats).iteritems_flat())

def create_pop_rates(self, **keywords): """ Calculate time-averaged population rates and store them in member pop_rates. If the rates had previously been stored with the same parameters, they are loaded from file. Parameters ---------- t_min : float, optional Minimal time in ms of the simulation to take into account for the calculation. Defaults to 500 ms. t_max : float, optional Maximal time in ms of the simulation to take into account for the calculation. Defaults to the simulation time. compute_stat : bool, optional If set to true, the mean and variance of the population rate is calculated. Defaults to False. Caution: Setting to True slows down the computation. areas : list, optional Which areas to include in the calculcation. Defaults to all loaded areas. pops : list or {'complete'}, optional Which populations to include in the calculation. If set to 'complete', all populations the respective areas are included. Defaults to 'complete'. """ default_dict = { 'areas': self.areas_loaded, 'pops': 'complete', 'compute_stat': False } params = ah._create_parameter_dict(default_dict, self.T, **keywords) iterator = ah.model_iter(mode='single', areas=params['areas'], pops=params['pops']) # Check if population rates have been stored with the same parameters fp = os.path.join(self.output_dir, 'pop_rates.json') self.pop_rates = ah._check_stored_data(fp, copy(iterator), params) if self.pop_rates is None: print("Computing population rates") d = nested_dict() d['Parameters'] = params if params['compute_stat']: for area in params['areas']: if params['pops'] == 'complete': pops = self.network.structure[area] else: pops = params['pops'] total_rates = [] for pop in pops: rate = ah.pop_rate(self.spike_data[area][pop], params['t_min'], params['t_max'], self.network.N[area][pop]) d[area][pop] = (rate[0], rate[1]) total_rates += rate[2] d[area]['total'] = (np.mean(total_rates), np.std(total_rates)) else: for area, pop in iterator: if pop in self.network.structure[area]: spikes = self.spike_data[area][pop][:, 1] indices = np.where( np.logical_and(spikes > params['t_min'], spikes < params['t_max'])) d[area][pop] = ( indices[0].size / (self.network.N[area][pop] * (params['t_max'] - params['t_min']) / 1000.0), np.nan) else: d[area][pop] = (0., 0.) for area in params['areas']: total_spikes = ah.area_spike_train(self.spike_data[area]) indices = np.where( np.logical_and(total_spikes[:, 1] > params['t_min'], total_spikes[:, 1] < params['t_max'])) d[area]['total'] = total_spikes[:, 1][indices].size / ( self.network.N[area]['total'] * (params['t_max'] - params['t_min']) / 1000.0) self.pop_rates = d.to_dict()
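# A worked example of the stationary-rate formula used above when compute_stat is False:
# rate = (# spikes of the population in [t_min, t_max]) / (N_neurons * duration_in_seconds).
# The numbers are illustrative.
n_spikes = 150000
n_neurons = 20000
t_min, t_max = 500.0, 10500.0          # ms
rate = n_spikes / (n_neurons * (t_max - t_min) / 1000.0)
print(rate)                            # ~0.75 spikes/s per neuron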
def deanonymize(self, G, target, path, amt, dl): pq = PriorityQueue() delays = {} costs = {} paths = nd.nested_dict() paths1 = nd.nested_dict() dists = {} visited = set() previous = {} done = {} prob = {} sources = [] pre = path[0] adv = path[1] nxt = path[2] for node in G.nodes(): previous[node] = -1 delays[node] = -1 costs[node] = inf paths[node] = [] dists[node] = inf done[node] = 0 paths1[node] = [] prob[node] = 1 dists[target] = 0 paths[target] = [target] costs[target] = amt delays[target] = dl pq.put((dists[target], target)) flag1 = 0 flag2 = 0 while(0 != pq.qsize()): curr_cost, curr = pq.get() if curr_cost > dists[curr]: continue visited.add(curr) for [v, curr] in G.in_edges(curr): if (G.edges[v, curr]["Balance"] + G.edges[curr, v]["Balance"] >= costs[curr]) and v not in visited: if done[v] == 0: paths1[v] = [v]+paths[curr] done[v] = 1 cost = dists[curr] + self.cost_function(G, costs[curr], curr, v) if cost < dists[v]: paths[v] = [v]+paths[curr] dists[v] = cost delays[v] = delays[curr] + G.edges[v, curr]["Delay"] costs[v] = costs[curr] + G.edges[v, curr]["BaseFee"] + \ costs[curr] * G.edges[v, curr]["FeeRate"] pq.put((dists[v], v)) if(curr in path[1:]): ind = path.index(curr) """ if(paths[curr]!=path[ind:]): return None """ # Check if the current optimal path could be trasformed into the suboptimal path considered if random hops were added. if self.is_not_possible_mod(path[ind:], paths[curr]): return None # """ if curr == adv: flag1 = 1 """ if(curr == pre): if paths[pre] != path: return [pre] else: sources.append(pre) flag2 = 1 """ # Due to the fact that suboptimal path are now being used this assumption has been removed to avoid large amounts of false positives. # Also bugs where the sender chooses a suboptimal path because of low forward balance while having a faster path with a large capacity channel are avoided. if (curr == pre): sources.append(pre) flag2 = 1 # """ if flag1 == 1 and flag2 == 1: if pre in paths[curr]: for [v, curr] in G.in_edges(curr): if v not in paths[curr]: sources.append(v) sources = set(sources) return sources
def create_rate_time_series(self, **keywords): """ Calculate time series of population- and area-averaged firing rates. Uses ah.pop_rate_time_series. If the rates have previously been stored with the same parameters, they are loaded from file. Parameters ---------- t_min : float, optional Minimal time in ms of the simulation to take into account for the calculation. Defaults to 500 ms. t_max : float, optional Maximal time in ms of the simulation to take into account for the calculation. Defaults to the simulation time. areas : list, optional Which areas to include in the calculcation. Defaults to all loaded areas. pops : list or {'complete'}, optional Which populations to include in the calculation. If set to 'complete', all populations the respective areas are included. Defaults to 'complete'. kernel : {'gauss_time_window', 'alpha_time_window', 'rect_time_window'}, optional Specifies the kernel to be convolved with the spike histogram. Defaults to 'binned', which corresponds to no convolution. resolution: float, optional Width of the convolution kernel. Specifically it correponds to: - 'binned' : bin width of the histogram - 'gauss_time_window' : sigma - 'alpha_time_window' : time constant of the alpha function - 'rect_time_window' : width of the moving rectangular function """ default_dict = { 'areas': self.areas_loaded, 'pops': 'complete', 'kernel': 'binned', 'resolution': 1.0 } params = ah._create_parameter_dict(default_dict, self.T, **keywords) # Check if firing rates have been stored with the same parameters fp = os.path.join(self.output_dir, 'rate_time_series') iterator_areas = ah.model_iter(mode='single', areas=params['areas'], pops=None) iterator_pops = ah.model_iter(mode='single', areas=params['areas'], pops=params['pops']) self.rate_time_series = ah._check_stored_data(fp, copy(iterator_areas), params) fp = os.path.join(self.output_dir, 'rate_time_series_pops') self.rate_time_series_pops = ah._check_stored_data( fp, copy(iterator_pops), params) if self.rate_time_series is None: print('Computing rate time series') # calculate area-averaged firing rates d = nested_dict() d['Parameters'] = params # population-averaged firing rates d_pops = nested_dict() d_pops['Parameters'] = params for area, pop in iterator_pops: if pop in self.network.structure[area]: time_series = ah.pop_rate_time_series( self.spike_data[area][pop], self.network.N[area][pop], params['t_min'], params['t_max'], params['resolution'], kernel=params['kernel']) else: time_series = np.nan * np.ones(params['t_max'] - params['t_min']) d_pops[area][pop] = time_series total_spikes = ah.area_spike_train(self.spike_data[area]) time_series = ah.pop_rate_time_series( total_spikes, self.network.N[area]['total'], params['t_min'], params['t_max'], params['resolution'], kernel=params['kernel']) d[area] = time_series self.rate_time_series_pops = d_pops.to_dict() self.rate_time_series = d.to_dict()
def flatten(params): return { '.'.join(k): v for k, v in nested_dict(params).items_flat() if v is not None }
# Specify filename file_name = "hns_2018_2019.csv" # initialize dictionary of performance when removing top x%, # Key: words left in vocab Value: precision, recall, accuracy, f-measure performance_in_percent_dict = nd.nested_dict() # initialize dictionary of performance when removing words with x frequency, # Key: words left in vocab Value: precision, recall, accuracy, f-measure performance_dict = nd.nested_dict() # Part 1 & Part 2 stop_words = [] baseline_performance = main(file_name, stop_words, "model-2018.txt", "baseline-result.txt") performance_dict = get_performance(performance_dict, baseline_performance[1], baseline_performance[0], baseline_performance[2]) performance_in_percent_dict = get_performance(performance_in_percent_dict, baseline_performance[1], baseline_performance[0], baseline_performance[2])
def dest_reveal_new(G, adversary, delay, amount, pre, next): T = nd.nested_dict() flag1 = True anon_sets = nd.nested_dict() level = 0 index = 0 # Level 0 only contains the next node T[0]["nodes"] = [next] T[0]["delays"] = [delay] print(delay) T[0]["previous"] = [-1] T[0]["visited"] = [[pre, adversary, next]] T[0]["amounts"] = [amount] x = -1 paths = nd.nested_dict() num_paths = 0 # flag to indicate that going further would result only in invalid nodes as the delay limit is reached for all nodes in the current level flag = True while (flag): level += 1 # Stop when level is greater than 3 - it takes forever otherwise if (level == 4): flag1 = False break t1 = T[level - 1]["nodes"] d1 = T[level - 1]["delays"] p1 = T[level - 1]["previous"] v1 = T[level - 1]["visited"] a1 = T[level - 1]["amounts"] pr1 = T[level - 1]["probs"] t2 = [] d2 = [] p2 = [] v2 = [[]] a2 = [] pr2 = [] for i in range(0, len(t1)): u = t1[i] for [u, v] in G.out_edges(u): # Checks if v is not repeating in the same path, delay limit is not reached after visiting v and the capacity condition is true after deducting fees if (v != pre and v != adversary and v != next and v not in v1[i] and (d1[i] - G.edges[u, v]["Delay"]) >= 0 and (G.edges[u, v]["Balance"] + G.edges[v, u]["Balance"]) >= ((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))): t2.append(v) d2.append(d1[i] - G.edges[u, v]["Delay"]) p2.append(i) v2.append(v1[i] + [v]) a2.append(((a1[i] - G.edges[u, v]["BaseFee"]) / (1 + G.edges[u, v]["FeeRate"]))) T[level]["nodes"] = t2 #print(level,t2,d2) T[level]["delays"] = d2 T[level]["previous"] = p2 T[level]["visited"] = v2 T[level]["amounts"] = a2 #T[level]["probs"] = pr2 #print(t2,d2,p2) print(level, len(t2)) # Stop if the current level has 0 nodes if (len(t2) == 0): flag = False level = level - 1 while (level >= 0): t = T[level]["nodes"] d = T[level]["delays"] p = T[level]["previous"] a = T[level]["amounts"] v = T[level]["visited"] #print(level) for i in range(0, len(t)): # Potential destination if delay is 0 if (d[i] == 0): #construct the path found from the next node to the destination path = [] level1 = level path.append(T[level1]["nodes"][i]) loc = T[level1]["previous"][i] while (level1 > 0): level1 = level1 - 1 path.append(T[level1]["nodes"][loc]) loc = T[level1]["previous"][loc] path.reverse() # Add pre and adversary to the start of the path path = [pre, adversary] + path # Double check that path is loop free if (len(path) == len(set(path))): #print(path, level) amt = a[i] pot = path[len(path) - 1] # For each destination find the sources that would use this subpath using either lnd,c-lightning or eclair sources_lnd = deanonymize_lnd(G, pot, path, amt) if sources_lnd != []: print("match", pot, "lnd") anon_sets[pot]["lnd"] = list(sources_lnd) # Check for more fuzz values only if the anonymity sets do not match for fuzz values -1 and 1 fuzz = -0.8 sources_c = deanonymize_c(G, pot, path, amt, -1) sources_c1 = deanonymize_c(G, pot, path, amt, 1) if (sources_c1 != sources_c): sources_c = sources_c + sources_c1 while fuzz <= 0.8: s = deanonymize_c(G, pot, path, amt, fuzz) if (s != []): sources_c = sources_c + s fuzz += 0.2 sources_c = list(set(sources_c)) if sources_c != []: print("match", pot, "c", fuzz) anon_sets[pot]["c"] = list(set(sources_c)) sources_ecl = deanonymize_ecl(G, pot, path, amt) if sources_ecl != []: print("match", pot, "ecl") anon_sets[pot]["ecl"] = list(sources_ecl) level = level - 1 return anon_sets, flag1
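# A compact sketch of the c-lightning fuzz sweep above: candidate sources are collected for the
# extreme fuzz values -1 and 1 and then for the intermediate values -0.8 ... 0.8 in steps of 0.2,
# and finally de-duplicated. Here candidate_sources stands in for a call to deanonymize_c with a
# fixed (G, pot, path, amt) and a varying fuzz argument.
def sweep_fuzz(candidate_sources):
    sources = candidate_sources(-1) + candidate_sources(1)
    fuzz = -0.8
    while fuzz <= 0.8:
        sources += candidate_sources(fuzz)
        fuzz += 0.2
    return list(set(sources))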
def ic_density(bed=None, cut_num_ls=None, savefn=None, split_overlap_ratio_min=0.5, sample='cell1_cell4'): if bed is None: bed = '/Share/home/zhangqf7/gongjing/zebrafish/result/dynamic_merge_region/005_005_new/abs/%s/window-anno.bed' % ( sample) if cut_num_ls is None: cut_num_ls = [20, 200, 80] if savefn is None: savefn = '/Share/home/zhangqf7/gongjing/zebrafish/result/icshape_signal_mean/sample_%s_dynamic_density.txt' % ( sample) trans_dict = loadTransGtfBed2( '/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.trans.bed2' ) bed_region_dict = nested_dict(1, list) with open(bed, 'r') as BED: for line in BED: line = line.strip() if not line or line.startswith('#'): continue arr = line.split('\t') if [int(arr[1]), int(arr[2])] not in bed_region_dict[arr[0]]: # if [arr[1], arr[2]] not in bed_region_dict[arr[0]]: # [arr[1], arr[2]] is str, while [int(arr[1]), int(arr[2])] is int bed_region_dict[arr[0]].append([int(arr[1]), int(arr[2])]) print bed_region_dict['NM_205538'] anno = bed.replace('.bed', '.element.txt') ANNO = open(anno, 'w') fa_dict = read_fa() seq_savefn = savefn.replace('.txt', '.seq.txt') SEQ = open(seq_savefn, 'w') with open(savefn, 'w') as SAVEFN: for tx, j in bed_region_dict.items(): if not trans_dict.has_key(tx): continue tx_element_count = [0] * sum(cut_num_ls) utr_5_start, utr_5_end, cds_start, cds_end, utr_3_start, utr_3_end = [ int(trans_dict[tx][g]) for g in [ 'utr_5_start', 'utr_5_end', 'cds_start', 'cds_end', 'utr_3_start', 'utr_3_end' ] ] if utr_5_end < cut_num_ls[0]: continue if cds_end - cds_start + 1 < cut_num_ls[1]: continue if utr_3_end - utr_3_start + 1 < cut_num_ls[2]: continue utr_5_split = list_split_equal(xrange(utr_5_start, utr_5_end + 1), cut_num_ls[0]) cds_split = list_split_equal(xrange(cds_start, cds_end + 1), cut_num_ls[1]) utr_3_split = list_split_equal(xrange(utr_3_start, utr_3_end + 1), cut_num_ls[2]) all_split = utr_5_split + cds_split + utr_3_split # 1-based for (j_start, j_end) in j: for n, (split_start, split_end) in enumerate(all_split): if max(j_start, split_start) < min(j_end, split_end): overlap_len = min(j_end, split_end) - max( j_start, split_start) + 1 split_overlap_ratio = overlap_len / float(split_end - split_start + 1) if split_overlap_ratio >= split_overlap_ratio_min: tx_element_count[n] += 1 print >> ANNO, '\t'.join( map(str, [ tx, j_start, j_end, split_start, split_end, n, overlap_len, split_overlap_ratio ])) print >> SAVEFN, '\t'.join(map(str, [tx] + tx_element_count)) tx_a_content_cout = [0] * sum(cut_num_ls) tx_t_content_cout = [0] * sum(cut_num_ls) tx_c_content_cout = [0] * sum(cut_num_ls) tx_g_content_cout = [0] * sum(cut_num_ls) for n, (split_start, split_end) in enumerate(all_split): a = fa_dict[tx][split_start - 1:split_end].upper().count('A') / float( len(fa_dict[tx][split_start - 1:split_end].upper())) t = fa_dict[tx][split_start - 1:split_end].upper().count('T') / float( len(fa_dict[tx][split_start - 1:split_end].upper())) c = fa_dict[tx][split_start - 1:split_end].upper().count('C') / float( len(fa_dict[tx][split_start - 1:split_end].upper())) g = fa_dict[tx][split_start - 1:split_end].upper().count('G') / float( len(fa_dict[tx][split_start - 1:split_end].upper())) tx_a_content_cout[n] = a tx_t_content_cout[n] = t tx_c_content_cout[n] = c tx_g_content_cout[n] = g print >> SEQ, '\t'.join(map(str, [tx, 'A'] + tx_a_content_cout)) print >> SEQ, '\t'.join(map(str, [tx, 'T'] + tx_t_content_cout)) print >> SEQ, '\t'.join(map(str, [tx, 'C'] + tx_c_content_cout)) print >> SEQ, 
'\t'.join(map(str, [tx, 'G'] + tx_g_content_cout)) ANNO.close() SEQ.close() return savefn
def resnet(depth, width, num_classes): assert (depth - 4) % 6 == 0, 'depth should be 6n+4' n = (depth - 4) // 6 widths = torch.Tensor([16, 32, 64]).mul(width).int() def gen_block_params(ni, no): return { 'conv0': conv_params(ni, no, 3), 'conv1': conv_params(no, no, 3), 'bn0': bnparams(ni), 'bn1': bnparams(no), 'convdim': conv_params(ni, no, 1) if ni != no else None, } def gen_group_params(ni, no, count): return {'block%d'%i: gen_block_params(ni if i==0 else no, no) for i in range(count)} def gen_group_stats(ni, no, count): return {'block%d'%i: {'bn0': bnstats(ni if i==0 else no), 'bn1': bnstats(no)} for i in range(count)} params = nested_dict({ 'conv0': conv_params(3,16,3), 'group0': gen_group_params(16, widths[0], n), 'group1': gen_group_params(widths[0], widths[1], n), 'group2': gen_group_params(widths[1], widths[2], n), 'bn': bnparams(widths[2]), 'fc': linear_params(widths[2], num_classes), }) stats = nested_dict({ 'group0': gen_group_stats(16, widths[0], n), 'group1': gen_group_stats(widths[0], widths[1], n), 'group2': gen_group_stats(widths[1], widths[2], n), 'bn': bnstats(widths[2]), }) flat_params = OrderedDict() flat_stats = OrderedDict() for keys,v in params.iteritems_flat(): if v is not None: flat_params['.'.join(keys)] = Variable(v, requires_grad=True) for keys,v in stats.iteritems_flat(): flat_stats['.'.join(keys)] = v def activation(x, params, stats, base, mode): return F.relu(F.batch_norm(x, weight = params[base+'.weight'], bias = params[base+'.bias'], running_mean = stats[base+'.running_mean'], running_var = stats[base+'.running_var'], training = mode, momentum = 0.1, eps = 1e-5)) def block(x, params, stats, base, mode, stride): o1 = activation(x, params, stats, base+'.bn0', mode) y = F.conv2d(o1, params[base+'.conv0'], stride=stride, padding=1) o2 = activation(y, params, stats, base+'.bn1', mode) z = F.conv2d(o2, params[base+'.conv1'], stride=1, padding=1) if base + '.convdim' in params: return z + F.conv2d(o1, params[base+'.convdim'], stride=stride) else: return z + x def group(o, params, stats, base, mode, stride): for i in range(n): o = block(o, params, stats, '%s.block%d'%(base,i), mode, stride if i==0 else 1) return o def f(input, params, stats, mode, prefix=''): x = F.conv2d(input, params[prefix+'conv0'], padding=1) g0 = group(x, params, stats, prefix+'group0', mode, 1) g1 = group(g0, params, stats, prefix+'group1', mode, 2) g2 = group(g1, params, stats, prefix+'group2', mode, 2) o = activation(g2, params, stats, prefix+'bn', mode) o = F.avg_pool2d(o, 8, 1, 0) o = o.view(o.size(0), -1) o = F.linear(o, params[prefix+'fc.weight'], params[prefix+'fc.bias']) return o, [g0, g1, g2] return f, flat_params, flat_stats
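# A hedged usage sketch for the functional wide ResNet above. It assumes the helper constructors
# used inside resnet() (conv_params, bnparams, bnstats, linear_params) are in scope, as in the
# original module; the input is the usual CIFAR-sized 3x32x32 batch.
import torch
from torch.autograd import Variable

f, flat_params, flat_stats = resnet(depth=28, width=2, num_classes=10)
x = Variable(torch.randn(4, 3, 32, 32))            # batch of 4 CIFAR-sized images
logits, (g0, g1, g2) = f(x, flat_params, flat_stats, mode=True)
print(logits.size())                               # expected: (4, 10)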