def encode(prog_ast, encoding, bq=None):
    """ Create an optimized binary encoding of the input program in a BitQueue. """
    if bq is None:
        bq = bitqueue.BitQueue()
    code_table = ddict(int)
    # comcounts is an encoding dictionary of a list of function dictionaries.
    enccounts0, enccounts1, enccounts2 = comcounts[encoding]
    # Then, group up functions based on rank within arity and across signatures of the
    # same arity, and create a mapping from the "group name" to their combined counts,
    # and likewise from a signature to its group name. From this combined list, make a
    # new Huffman tree.
    # Handle nullary functions first (they go straight in the table because they take
    # no arguments and are already in the right format).
    code_table.update(enccounts0)
    # Now unary functions.
    # enccounts1 maps argumenttypestr to dictionaries which map function names to counts.
    unarygroupmap = ddict(list)
    for astr, funcmap in enccounts1.items():
        for i, func in enumerate(sorted(list(funcmap), key=lambda x: funcmap[x])):
            code_table["unarygroup" + str(i)] += funcmap[func]
            unarygroupmap[func].append(i)
    # Now binary functions.
    binarygroupmap = ddict(list)
    for astr, funcmap in enccounts2.items():
        for i, func in enumerate(sorted(list(funcmap), key=lambda x: funcmap[x])):
            code_table["binarygroup" + str(i)] += funcmap[func]
            binarygroupmap[func].append(i)
    h = HuffmanTree(code_table)
    # Now that we have the Huffman tree built, we need to find the ideal positions of
    # everything in the output program.
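# A toy illustration of the unary grouping step above (hypothetical counts, not from
# the original): within each argument-type signature the functions are ranked by
# frequency, and rank i across all signatures is pooled into one "unarygroup{i}"
# symbol whose count is the sum, so the Huffman tree only needs a code per rank.
from collections import defaultdict as ddict

enccounts1 = {"int": {"f": 5, "g": 2}, "str": {"h": 7, "k": 1}}
code_table = ddict(int)
unarygroupmap = ddict(list)
for astr, funcmap in enccounts1.items():
    for i, func in enumerate(sorted(funcmap, key=lambda x: funcmap[x])):
        code_table["unarygroup" + str(i)] += funcmap[func]
        unarygroupmap[func].append(i)
# code_table     -> {'unarygroup0': 3, 'unarygroup1': 12}   (g+k, f+h)
# unarygroupmap  -> {'g': [0], 'f': [1], 'k': [0], 'h': [1]}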
def __init__(self, data):
    self.data = data
    need_authors = get_need_authors(self.data)
    need_papers = get_need_papers(self.data)
    paper_author_pacnt = ddict(lambda: ddict(int))
    author_paperset = dict()
    paper_author_name = ddict(lambda: ddict(set))
    for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
        if paperid in need_papers:
            paper_author_pacnt[paperid][authorid] += 1
            if len(name) > 0:
                paper_author_name[paperid][authorid].add(name)
    for paperid, author_pacnt in paper_author_pacnt.items():
        for author, pacnt in author_pacnt.items():
            if author in need_authors or pacnt == 2:
                author_paperset[author] = set()
    for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
        if authorid in author_paperset:
            author_paperset[authorid].add(paperid)
    self.author_paperset = author_paperset
    self.paper_author_pacnt = paper_author_pacnt
    self.paper_author_name = paper_author_name
    return
def run(trainfile, doc_mappid_file, out, num_mapper):
    docid_mapperid = load_docid_mapperid(doc_mappid_file)
    mapperid_cnt = ddict(int)
    for mapperid in docid_mapperid.values():
        mapperid_cnt[mapperid] += 1
    mapperid_partid = dict()
    partid_cnt = ddict(int)
    for mappid in sorted(mapperid_cnt.keys(), key=lambda x: -mapperid_cnt[x]):
        min_partid = -1
        for partid in range(num_mapper):
            if min_partid == -1 or partid_cnt[partid] < partid_cnt[min_partid]:
                min_partid = partid
        mapperid_partid[mappid] = min_partid
        partid_cnt[min_partid] += mapperid_cnt[mappid]
    fps = dict()
    for partid in mapperid_partid.values():
        fps[partid] = open('%s/part-%d' % (out, partid), 'w')
    num_line = 0
    for line in file(trainfile):
        mappid = docid_mapperid[num_line]
        print >> fps[mapperid_partid[mappid]], '%d %s' % (num_line, line[:-1])
        num_line += 1
    for fp in fps.values():
        fp.close()
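# The loop above spreads mapper ids over num_mapper output partitions greedily: ids
# are visited from most to least frequent, and each one goes to the currently
# least-loaded partition. A minimal, self-contained sketch of just that balancing
# step, with toy counts and hypothetical names:
from collections import defaultdict as ddict

def balance(counts, num_parts):
    part_load = ddict(int)
    assignment = {}
    for key in sorted(counts, key=lambda k: -counts[k]):
        target = min(range(num_parts), key=lambda p: part_load[p])
        assignment[key] = target
        part_load[target] += counts[key]
    return assignment

# balance({'a': 10, 'b': 7, 'c': 5, 'd': 2}, 2)
# -> {'a': 0, 'b': 1, 'c': 1, 'd': 0}, with both partitions carrying a load of 12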
def run_main(valid_csv, valid_gt_csv):
    csv_reader = csv.reader(file(valid_csv))
    csv_reader.next()
    author_papercnt = ddict(lambda: ddict(int))
    for cols in csv_reader:
        authorid = int(cols[0])
        for paperid in map(int, cols[1].split()):
            author_papercnt[authorid][paperid] += 1
    csv_reader = csv.reader(file(valid_gt_csv))
    csv_reader.next()
    print 'Authorids,ConfirmedPapers,DeletedPapers'
    for cols in csv_reader:
        authorid = int(cols[0])
        confirmedpapers = list()
        for paperid in map(int, cols[1].split()):
            confirmedpapers.append(paperid)
            author_papercnt[authorid][paperid] -= 1
        deletedpapers = list()
        for paperid, cnt in author_papercnt[authorid].items():
            if cnt < 0:
                print >> sys.stderr, 'error'
            if cnt > 0:
                for i in range(cnt):
                    deletedpapers.append(paperid)
        print '%d,%s,%s' % (authorid, ' '.join(map(str, confirmedpapers)), ' '.join(map(str, deletedpapers)))
def check_links():
    print "\nMissing Links"
    outs = []
    refs = ddict(lambda: ddict(set))
    for fn in glob('doc/*/*.html') + glob('doc/*/*/*.html'):
        # print fn
        soup = parse(fn)
        for link in soup.select('[href]'):
            href = link['href']
            url = absify(fn, href)
            if url.startswith('http'):
                outs.append(url)
            else:
                if '#' in url:
                    pg, anchor = url.split('#')
                    refs[pg]['anchors'].add('#' + anchor)
                    url = pg
                refs[url]['refrs'].add(fn)
                # print "- %s :: <%s>" % (url, link['href'])
    refs = {k: dict((kind, sorted(lst)) for kind, lst in v.items()) for k, v in refs.items()}
    # broken links
    for fn, info in sorted(refs.items()):
        if not os.path.exists(fn):
            print "!", fn, '<-', info['refrs']
def get_kegg_reactions():
    """
    :return:
    """
    import multiprocessing
    rp_record_by_id = ddict(lambda: ddict(set))
    reac_ids = kegg_parser.reactionIds
    print("# reacids: {0}".format(len(reac_ids)))
    p = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = p.map(get_compounds, reac_ids, chunksize=20)
    for reactants_ids, product_ids in t:
        for id__ in reactants_ids:
            for id_ in product_ids:
                rp_record_by_id[id__]['as_r'].add(id_)
                rp_record_by_id[id_]['as_p'].add(id__)
    # transform value to list
    for key in rp_record_by_id:
        v_r = rp_record_by_id[key]['as_r']
        rp_record_by_id[key]['as_r'] = list(v_r)
        v_p = rp_record_by_id[key]['as_p']
        rp_record_by_id[key]['as_p'] = list(v_p)
    print("len rp record: {}".format(len(rp_record_by_id)))
    return rp_record_by_id
def _aggregate(self, kw): event_types = db.get_event_types() event_types_lookup = dict((id, name) for name, id in event_types.items()) # Build the return value in this object #State # CL region # Daterange #eventtype rv = ddict(lambda: ddict(lambda: ddict(lambda: ddict( lambda: dict((e,0) for e in kw['counts']))))) # And the actual counts... for event in db.get_all_events(): if event['event_id_obfuscated'] in self.anomalous: continue if event_types_lookup[event['event_type_id']] in self.hidden_events: continue interval = bisect.bisect(kw['timebreaks'], event[kw['time_type']]) if interval == 0 or interval == len(kw['timebreaks']): # Event lies outside requested time intervals continue # If we decide to make smaller queries from the front end, # a lot of this filtering could be moved into the sql # query in db.py to make it faster. But it's clearer here. if not ((event['venue_state_cd'] in kw['states']) and (event['clregion'] in kw['clregions']) and (event['venue_zip'] in kw['zips']) and (event['event_type_id'] in kw['event_types'])): continue datestring = datetime.strftime(kw['timebreaks'][interval-1], '%Y-%m-%d') event_type = event_types_lookup[int(event['event_type_id'])] ccounts = rv[event['venue_state_cd']][event['clregion']][datestring][event_type] for counttype in kw['counts']: summand = {'count': 1, 'rsvp': int(event['attendee_count'])}[counttype] ccounts[counttype] += summand return deddict(rv)
def __init__(self):
    self._next_key = 0
    self._edges = ddict(lambda: ddict(lambda: ddict(lambda: {})))  # node -> node -> key -> {attr}
    self._nodes = {}
def _compute_mult(self, node):
    W = ddict(int)
    rW = ddict(list)
    for g in node.get_children():
        s = self.lcamap[g]
        rW[s.name].append(g)
        W[s.name] += 1
    return W, rW
def __init__(self, *args, **kwargs):
    super(MyController, self).__init__(*args, **kwargs)  # Mandatory
    self.mac_tables = ddict(dict)   # {DPID => {MAC => PORT}}
    self.arp_table = dict()         # {IP => MAC}
    self.switchports = ddict(dict)  # {DPID => {PORTID => PORT_STATUS}}
    self.stats_requester_thread = hub.spawn(self._port_stats_requester)
    self.switches = {}              # {DPID => SWITCH}
def infer_assignment_probabilities(self, n_samples=200, n_burning_sample=10): """ Main function :param n_samples: :param n_burning_sample: """ assert n_samples > n_burning_sample, "n_burning_samples seems to be > to n_samples" #assign obvious self._assign_without_ambiguity() #g et counter for each feature in order to store sampling # mapping between annotation and its metabolite # may be useful to retrieve probabilities and assign # it the posterior prbability annotation_by_metab_id_by_feature = ddict(dict) # dict for counting the sampling result in each iteration # map a feature and collections.Counter counter_by_feature = ddict(Counter) # fill the 2 previous declared dicts for f in self.features: annotations = f.annotations #counter = Counter() for a in annotations: m = a.metabolite annotation_by_metab_id_by_feature[f][m.kegg_id] = a # we are counting the string #counter[m.kegg_id] #= 0 counter_by_feature[f][m.kegg_id] = 0 #= counter # initalization of prior probabilities logging.info("Init probabilities...") prob_by_metab_by_feature = self._init_probs() for i in xrange(n_samples): assigned_compounds = self.assigned_compounds rdm.shuffle(self.features) #for f in self.features: #rdm.shuffle(self.features): self._sample(assigned_compounds, prob_by_metab_by_feature, counter_by_feature) self._update_probs(assigned_compounds, prob_by_metab_by_feature) sys.stdout.write("Progression:[*** " + str(int(round(float(i) / n_samples * 100))) + "% ***]" + chr(13)) logging.info("Computing posterior probabilities...") # posterior probabilities calculation self._calc_posterior_probs(counter_by_feature, prob_by_metab_by_feature, n_samples, n_burning_sample) for f, prob_by_metab in prob_by_metab_by_feature.iteritems(): for metab_id, prob in prob_by_metab.iteritems(): if math.isnan(prob): raise ValueError("nan probability") annotation_by_metab_id_by_feature[f][metab_id].score_network = prob logging.info("Done.")
def read_conference_jorunal_csv(self, conference_csv):
    csv_reader = csv.reader(file(conference_csv, 'r'))
    csv_reader.next()
    cid_info_dict = ddict(lambda: ddict())
    for cols in csv_reader:
        cid = int(cols[0])
        cid_info_dict[cid]['shortname'] = cols[1]
        cid_info_dict[cid]['longname'] = cols[2]
    return cid_info_dict
def viterbi(sentence, tags, transition, emission): probs = ddict(lambda: ddict(lambda: -1)) probs[-1]['<begin>'] = 1.0 backpt = ddict(lambda: {}) i = -1 for i, word in enumerate(sentence): for tag, cur_count in tags.items(): path_probs = {} if i == 0: prev_tag = '<begin>' tr = prev_tag+'_'+tag if tr in transition: trans = float(transition[tr]) else: trans = -1 em = tag+'_'+word if em in emission: emiss = float(emission[em])/cur_count else: emiss = -1 path_probs[tag] = (probs[i-1][prev_tag] * trans * emiss) else: for prev_tag, prev_count in tags.items(): tr = prev_tag+'_'+tag if tr in transition: trans = float(transition[tr])/prev_count else: trans = -1 em = tag+'_'+word if em in emission: emiss = float(emission[em])/cur_count else: emiss = -1 path_probs[prev_tag] = (probs[i-1][prev_tag] * trans * emiss) best_tag = max(path_probs, key=path_probs.get) probs[i][tag] = path_probs[best_tag] backpt[i][tag] = best_tag i += 1 for tag, cur_count in tags.items(): tr = tag + '_' + '<finish>' if tr in transition: trans = float(transition[tr])/cur_count else: trans = -1 emiss = 1 path_probs[tag] = (probs[i-1][tag] * trans * emiss) best_tag = max(path_probs, key=path_probs.get) probs[i][tag] = path_probs[best_tag] backpt[i][tag] = best_tag currrent_tag = max(probs[i], key=probs[i].get) predicted_tags = [] for j in range(i,-1,-1): predicted_tags.append(currrent_tag) currrent_tag = backpt[j][currrent_tag] predicted_tags.reverse() return predicted_tags[:-1]
def f90deps(srcs,directory=None): files = [] exts = [ ".f90", ".F90", ".F", ".f", ".f77", ".f03", ".F03", ".f08", ".F08" ] for src in srcs: fileName, fileExtension = os.path.splitext(src) if fileExtension in exts: files.append(src) srcs = files use_line_re = re.compile(r"^\s*use\s+(\S.+)\s*$",re.IGNORECASE) mod_line_re = re.compile(r"^\s*module\s+(\S+)\s*$",re.IGNORECASE) cont_line_re = re.compile(r"^(.*)&\s*$") split_re = re.compile(r"\s*,\s*") dep_re = re.compile(r"(.*)") mod_re = re.compile(r"(.*)") info = ddict() for src in srcs: info[src] = f90depinfo() if directory is not None and directory != "": fh = file(directory + "/" + src,"r") else: fh = file(src,"r") liter = iter(fh) while True: try: line = getline(liter) has_use = re.match( use_line_re, line ) has_mod = re.match( mod_line_re, line ) if has_use is not None: for mod in has_use.group(1).split(","): info[src].uses[ mod.strip() ] = None elif has_mod is not None: info[src].provides[ has_mod.group(1).strip() ] = None except Exception as e: # print "exception: ",e break modules = ddict() for src in srcs: for m in info[src].provides: modules[m] = src for src in srcs: tmp = copy.deepcopy( info[src].uses ) for m in info[src].uses: if not m in modules: tmp.pop(m,None) else: tmp[m] = modules[m] for m in info[src].provides: if m in tmp: tmp.pop(m,None) info[src].uses = tmp return info
def __init__(self, data):
    self.data = data
    pid_uid_name = ddict(lambda: ddict(lambda: ""))
    pid_uid_affi = ddict(lambda: ddict(lambda: ""))
    for (pid, uid, name, affi) in data.paperauthor_tuples:
        pid_uid_name[pid][uid] = name
        pid_uid_affi[pid][uid] = affi
    self.pid_uid_name = pid_uid_name
    self.pid_uid_affi = pid_uid_affi
    return
def run_main(submission_dir, submission_list_file, output):
    submissions = [
        submission_dir + '/' + x.strip() for x in file(submission_list_file)
        if len(x.strip()) > 0 and x.strip()[0] != '#'
    ]
    author_paperrank = ddict(lambda: ddict(float))
    for submission in submissions:
        read_submission(submission, author_paperrank)
    with open(output, 'w') as fw:
        print >> fw, 'AuthorId,PaperIds'
        for authorid, paper_score in author_paperrank.items():
            print >> fw, '%d,%s' % (authorid, ' '.join(map(lambda x: str(x[0]), sorted(paper_score.items(), key=lambda x: -x[1]))))
def train(self, tagged_sentences):
    # count number of times a word is given each tag
    word_tag_counts = ddict(lambda: ddict(lambda: 0))
    for sent in tagged_sentences:
        for (word, tag) in sent:
            word_tag_counts[word][tag] += 1
    # select the tag used most often for the word
    for word in word_tag_counts:
        tag_counts = word_tag_counts[word]
        tag = max(tag_counts, key=tag_counts.get)
        self._word_tags[word] = tag
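# The method above is the classic most-frequent-tag baseline. A minimal,
# self-contained sketch of the same idea without the surrounding class
# (names and sample data here are illustrative, not from the original):
from collections import defaultdict as ddict

def train_baseline(tagged_sentences):
    word_tag_counts = ddict(lambda: ddict(int))
    for sent in tagged_sentences:
        for word, tag in sent:
            word_tag_counts[word][tag] += 1
    # pick the most frequent tag per word
    return {word: max(tags, key=tags.get) for word, tags in word_tag_counts.items()}

# train_baseline([[('the', 'DET'), ('dog', 'NOUN')],
#                 [('the', 'DET'), ('dog', 'VERB')],
#                 [('dog', 'NOUN')]])
# -> {'the': 'DET', 'dog': 'NOUN'}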
def __init__(self,pdbfile,mode='3D',chains=None,MaxAtomsPerLeaf=96,DunbrackNaming=False,BSPTree=True): if not os.path.exists(pdbfile): raise InputError('File does not exist!') sys.exit(1) else: startTime = time.time() self.filename = str(pdbfile) self.hash = {} self.stats = ddict(int) self.geoCentre = p3d.vector.Vector() self.massCentre = p3d.vector.Vector() self.atoms = [] # storage for all Atom objects self.chainTermini = ddict(list) self.init_hashes() self.resolution = 0.01 self.leftOvers = [] self.atomInfoIndex = None self.header = [] self.headers_infos = ddict(list) self.conect = [] # --*-- remove path info from pdb filename --*-- self.fullname = str(pdbfile) if len(pdbfile.split('/')) == 0 else str(pdbfile.split('/')[-1]) # --*-- head of family flag used for high throuput to distinguish redundant and non-redundant set --*-- self.HOF = False if len(self.fullname.split('.')[0]) <= 4: self.id = self.fullname[:4] self.dunbrackChain = None elif len(self.fullname.split('.')[0]) == 5 and DunbrackNaming == True: # special identifiers in file name, i.e. Dunbrack chain ids self.id = self.fullname[:4] self.dunbrackChain = self.fullname[4:5].upper() if chains == None: chains = list(self.dunbrackChain) elif len(self.fullname.split('.')[0]) == 6 and DunbrackNaming == True: # special identifiers in filename, i.e. Dunbrack chain id and head of family hook if self.fullname[:1] == '_': self.HOF = True self.id = self.fullname[1:5] self.dunbrackChain = self.fullname[5:6] else: self.id = self.fullname.split('.')[0] self.dunbrackChain = None if mode == '3D': self.BSPTree = p3d.tree.Tree(protein=self) self.read_in_3Dstructure(pdbfile,chains=chains) if BSPTree: self.BSPTree.build(MaxAtomsPerLeaf=MaxAtomsPerLeaf) elif mode == '2D': self.read_in_2Dstructure(pdbfile,chains=chains) else: print("mode error") sys.exit(1) self.init_parser() self.time = round(time.time()-startTime,3) return
def start(self):
    self.is_started = True
    self.start_time = time.time()
    self.round_duration = len(color_set) * self.interval_duration
    self.pools = ddict(_get_color_pool)

    def color_builder():
        (r, i) = calculate_round_interval(self.start_time, self.interval_duration)
        return self.pools[r].pop()

    self.colors = ddict(lambda: ddict(color_builder))
    # graphics stuff
    self.prog_bar = utils.ProgressBar(self.size[0], self.size[1] / 10.0, 1.0,
                                      bg_color=background_color,
                                      border_width=5, border_color=(255, 69, 0))
def get_intermediate_data(self):
    need_papers_set = get_need_papers(self.data)
    authorpaper_count = ddict(int)
    paper_authorlist = ddict(set)
    for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
        if paperid in need_papers_set:
            authorpaper_count[(authorid, paperid)] += 1
            paper_authorlist[paperid].add(authorid)
    self.authorpaper_count = authorpaper_count
    self.paper_authorlist = paper_authorlist
    return
def process_gen_bonus(self, input_file):
    bonus_file = "%s/bonus.txt" % self.result_dir
    months = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    bonuses = ddict(lambda: ddict(int))
    data = load_data(input_file)
    most_recent_day = max([sum(months[0:int(item[3])]) + int(item[4]) for item in data])
    for record in data:
        if most_recent_day - (sum(months[0:int(record[3])]) + int(record[4])) < self.bonus:
            user, item = record[0], record[1]
            bonuses[user][item] += 1
    output_mbonus(output_file=bonus_file, bonus=bonuses)
    return bonus_file
def __init__(self, data):
    self.data = data
    need_authors = get_need_authors(self.data)
    need_papers = get_need_papers(self.data)
    paper_author_name = ddict(lambda: ddict(set))
    for (paperid, authorid, name, affi) in self.data.paperauthor_tuples:
        if paperid in need_papers:
            if len(name) > 0:
                paper_author_name[paperid][authorid].add(name)
    self.paper_author_name = paper_author_name
    return
def __init__(self, data):
    self.data = data
    need_papers = get_need_papers(self.data)
    paper_authorlist = ddict(list)
    pa_cnt = ddict(int)
    for (paperid, authorid, name, affi) in data.paperauthor_tuples:
        if paperid in need_papers:
            paper_authorlist[paperid].append((authorid, name))
            pa_cnt[(paperid, authorid)] += 1
    self.paper_authorlist = paper_authorlist
    self.pa_cnt = pa_cnt
def lcseq_len(a, b):
    m = len(a)
    n = len(b)
    c = ddict(lambda: ddict(lambda: 0))
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i-1] == b[j-1]:
                c[i][j] = c[i-1][j-1] + 1
            elif c[i-1][j] >= c[i][j-1]:
                c[i][j] = c[i-1][j]
            else:
                c[i][j] = c[i][j-1]
    return c[m][n]
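# lcseq_len above is the standard longest-common-subsequence length DP: c[i][j] holds
# the LCS length of a[:i] and b[:j], with the nested defaultdicts acting as a
# zero-initialized (m+1) x (n+1) table. A small usage check (assuming lcseq_len and
# the ddict import are in scope):
assert lcseq_len("ABCBDAB", "BDCABA") == 4  # e.g. the common subsequence "BCBA"
assert lcseq_len("abc", "xyz") == 0         # no common characters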
def __init__(self, data):
    self.data = data
    train_authorpaperlist = ddict(list)
    need_authors = set()
    for (authorid, paperid, label) in data.train_tuples:
        train_authorpaperlist[authorid].append(paperid)
        need_authors.add(authorid)
    pa_authorpaperlist = ddict(list)
    for (paperid, authorid, name, affi) in data.paperauthor_tuples:
        if authorid in need_authors:
            pa_authorpaperlist[authorid].append(paperid)
    self.train_authorpaperlist = train_authorpaperlist
    self.pa_authorpaperlist = pa_authorpaperlist
def get_intermediate_data(self):
    author_deleted_paperset = ddict(set)
    author_confirmed_paperset = ddict(set)
    for (authorid, paperid, label) in self.data.train_tuples:
        if label == 1:
            author_confirmed_paperset[authorid].add(paperid)
        else:
            author_deleted_paperset[authorid].add(paperid)
    need_paper_set = get_need_papers(self.data)
    self.paper_author_list = get_paper_authorlist(self.data, need_paper_set)
    self.author_deleted_paperset = author_deleted_paperset
    self.author_confirmed_paperset = author_confirmed_paperset
    return
def __init__(self, data):
    self.data = data
    need_papers = get_need_papers(self.data)
    need_authors = get_need_authors(self.data)
    paper_authorlist = ddict(lambda: ddict(list))
    author_paperlist = ddict(list)
    for (paper, author, name, affi) in data.paperauthor_tuples:
        if paper in need_papers:
            paper_authorlist[paper][author].append((name, affi))
        if author in need_authors:
            author_paperlist[author].append((paper, name, affi))
    self.paper_authorlist = paper_authorlist
    self.author_paperlist = author_paperlist
    return
def clusterize_hierarchical(peakels, matrix_dist, cut, clip=False):
    """
    :param clip:
    :param peakels:
    :param matrix_dist:
    :param method:
    :param cut:
    """
    # having negative values in the matrix distance leads to a ValueError;
    # clip in order to prevent negative values in the matrix distance
    if clip:
        np.clip(matrix_dist, 0, 1, matrix_dist)
    k = linkage(matrix_dist, method='complete')
    #dist = maxdists(k)
    #fit = norm.fit(dist)
    #cut = np.percentile(dist, 10.0)  #norm.ppf(5.0, loc=fit[0], scale=fit[1])
    k2 = fcluster(k, cut, criterion='distance')  #, criterion='distance')
    clust_by_id = ddict(list)
    for i, v in enumerate(k2):
        clust_by_id[v].append(peakels[i])
    return clust_by_id.values()
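# linkage/fcluster above come from scipy.cluster.hierarchy. A minimal usage sketch
# with a toy condensed distance vector (assuming matrix_dist is in the condensed form
# that scipy's linkage expects; values here are made up):
from scipy.cluster.hierarchy import linkage, fcluster

# condensed pairwise distances for 3 items: d(0,1)=0.1, d(0,2)=0.9, d(1,2)=0.8
Z = linkage([0.1, 0.9, 0.8], method='complete')
labels = fcluster(Z, 0.5, criterion='distance')
# items 0 and 1 end up in the same flat cluster, item 2 in its own cluster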
def all_passing_stats(pDict, func, weight='weight'):
    stats = ddict(dict)
    for team1 in pDict.keys():
        for team2 in pDict[team1].keys():
            G = df_to_graph(pDict[team1][team2])
            stats[team1][team2] = func(G, weight=weight)
    return stats
def project_list_submissions(user, event): # get roster indexed by wiscmail roster = { student['net_id'] + "@wisc.edu": student for student in get_roster_json() if 'net_id' in student } project_id = event['project_id'] if not project_id in get_project_due_utc(): return (500, 'invalid project id') paths = s3().s3_all_keys('projects/' + project_id + '/') # email => list of submission IDs submissions = ddict(list) direct_submissions = ddict(list) # emails of students who have recevied test results or CRs on some version of their code reviewed = set() tested = set() for path in paths: parts = path.split('/') # Example: # projects/p2/tylerharter*at*gmail.com/2019-08-16_17-58-37/submission.json (len=5) # projects/p2/tylerharter*at*gmail.com/2019-08-16_17-58-37-link.json (len=4) if len(parts) > 2: email = parts[2].replace("*at*", "@") if len(parts) == 5 and parts[-1] == 'submission.json': direct_submissions[email].append(parts[3]) if len(parts) == 5 and parts[-1] == 'cr.json': reviewed.add(email) if len(parts) == 5 and parts[-1] == 'test.json': tested.add(email) link_suffix = '-link.json' if len(parts) == 4 and parts[-1].endswith(link_suffix): submissions[email].append(parts[3][:-len(link_suffix)]) # we only want links to submissions that have been directly # submitted (in contrast to submissions submitted by a partner), # so we don't review the same thing twice. # # we're also only interested in the most recent submission rows = [] for email in direct_submissions: latest_direct = max(direct_submissions[email]) if len(submissions[email]) == 0: # this should only happen if lambda crashed between S3 writes, or a submission is withdrawn continue latest = max(submissions[email]) if latest_direct != latest: continue # this direct submission has been superceded by a partner's more-recent submission row = { 'project_id': project_id, 'student_email': email, 'submission_id': latest_direct, 'has_review': email in reviewed, 'tested': email in tested, 'info': {}, } # supplement with info from roster for field in ['net_id', 'ta']: row['info'][field] = roster.get(email, {}).get(field, None) rows.append(row) return (200, { 'submissions': rows, 'reviewed': list(reviewed), 'direct': list(direct_submissions.keys()) })
puquxi hmeaehh oxe tasipw qzyg hyvy wcmpwe hvs fxq wvfy zjepsl dvrfxnc xnvg xle crcuc qkhnv crcuc oedez bjw pmwq xzzpiy cjwss jwscs apb bpa ydjhhf yeltadb lwi cjdcb ovaox xrdm vkxub zax xza admbc lvpzfeh auxn rwasj kebx eild nrskdr meja jxczomh gcne"""

inData = """
abcde fghij
abcde xyz ecdab
a ab abc abd abf abj
iiii oiii ooii oooi oooo
oiii ioii iioi iiio
"""

myDict = ddict(lambda: 0)
validNo = 0
for pwd in inData.split('\n'):
    myDict.clear()
    validPwd = True
    for part in pwd.split(' '):
        myDict[part] += 1
        if myDict[part] > 1:
            validPwd = False
    if validPwd:
        validNo += 1
print(validNo)
def postflight(self): ''' read the output and merge in back to the ident csv ''' potential_buggy_percolator_output = self.params['translations'][ 'percolator_out'] + '.psms' if os.path.exists(potential_buggy_percolator_output): print('WTF Percolator ?') print('Renaming: \n{percolator_out}.psms ->> {percolator_out}'. format(**self.params['translations'])) os.rename( self.params['translations']['output_file_incl_path'] + '.psms.psms', self.params['translations']['output_file_incl_path'] + '.psms') os.rename( self.params['translations']['decoy_output_file_incl_path'] + '.psms.psms', self.params['translations']['decoy_output_file_incl_path'] + '.psms') os.rename( self.params['translations']['output_file_incl_path'] + '.psms.peptides', self.params['translations']['output_file_incl_path'] + '.peptides') s2l = {'target': ddict(list), 'decoy': ddict(list)} for pkey, p_out in [('target', 'percolator_out'), ('decoy', 'percolator_decoy_out')]: percolator_output_dict_reader = csv.DictReader(open( self.params['translations'][p_out], 'r'), delimiter='\t') for line_dict in percolator_output_dict_reader: peptide = line_dict['peptide'].split('.')[1] psmid_pep_key = ( line_dict['PSMId'], peptide, ) if psmid_pep_key not in s2l[pkey].keys(): s2l[pkey][psmid_pep_key] = line_dict opened_file = open(self.params['translations']['csv_input_file'], 'r') csv_input = csv.DictReader(row for row in opened_file if not row.startswith('#')) if "PEP" not in csv_input.fieldnames and "q-value" not in csv_input.fieldnames: csv_input.fieldnames += ['PEP', 'q-value'] csv_kwargs = {} if sys.platform == 'win32': csv_kwargs['lineterminator'] = '\n' else: csv_kwargs['lineterminator'] = '\r\n' csv_output = csv.DictWriter( open(self.params['translations']['output_file_incl_path'], 'w'), csv_input.fieldnames, **csv_kwargs) csv_output.writeheader() for line_dict in csv_input: # check if the current line is a decoy or a target # so we know in which percolator output file we have to look for it. psm_type = "target" if line_dict['Is decoy'].upper() == 'TRUE': psm_type = "decoy" # if self.params['translations']['decoy_tag'] in line_dict['proteinacc_start_stop_pre_post_;']: # line_dict['Is decoy'] = "true" # psm_type = "decoy" if line_dict['Modifications'].strip() != '': seq_and_mods = '#'.join( [line_dict['Sequence'], line_dict['Modifications']]) else: seq_and_mods = line_dict['Sequence'] _psmid_pep_key = ( line_dict['Spectrum Title'], seq_and_mods, ) if _psmid_pep_key in s2l[psm_type].keys(): line_dict['PEP'] = s2l[psm_type][_psmid_pep_key][ 'posterior_error_prob'] line_dict['q-value'] = s2l[psm_type][_psmid_pep_key]['q-value'] # write all results including decoy into the full csv: csv_output.writerow(line_dict) else: print( 'Original PSM :{0} could not be found in percolator output file, most probably because PSM was filtered by percolator, (multiple peptides to one spectrum match)' .format(_psmid_pep_key))
def main(input_file=None, decoy_tag=None, output_file=None): ''' Converts xTandem.xml files into .csv We need to do this on our own, because mzidentml_lib reports wrong positions for modifications (and it is also not able to convert the piledriver.mzid into csv) It should be noted that - xtandem groups are not merged (since it is not the same as protein groups) - multiple domains (multiple occurence of a peptide in the same protein) are not reported ''' NEW_HEADERS = [ 'Raw data location', 'Spectrum ID', 'Spectrum Title', 'Retention Time (s)', 'rank', 'Calc m/z', 'Exp m/z', 'Charge', 'Sequence', 'Modifications', 'X\!Tandem:expect', 'X\!Tandem:hyperscore', 'proteinacc_start_stop_pre_post_;', 'Is decoy', ] PROTON = 1.00727646677 protein = None group = None group_counter = 0 protein_groups = [] csvOut = csv.DictWriter(open(output_file, 'w', newline=''), NEW_HEADERS) csvOut.writeheader() print("Converting XTandem XML into CSV: {0}".format(input_file)) tandemXML = iter( cElementTree.iterparse(input_file, events=(b'start', b'end'))) for pos, (event, element) in enumerate(tandemXML): if event == b'start': if element.tag.endswith('bioml'): raw_data_location = element.attrib['label'].split("'")[1] if element.tag.endswith('group'): if 'mh' in element.attrib.keys(): group_counter = 0 notes = ddict(set) notes_key = None group = {} # reset group group['Raw data location'] = raw_data_location group['Charge'] = element.attrib['z'] group['Exp m/z'] = (float(element.attrib['mh']) / float(group['Charge'])) + PROTON group['X\!Tandem:expect'] = element.attrib['expect'] protein_groups = [] domain_groups = [] if element.attrib['label'] == 'fragment ion mass spectrum': notes_key = 'spectrum' if element.attrib['label'] == 'input parameters': notes_key = 'parameters' group_counter += 1 elif element.tag.endswith('protein'): protein = {} # reset protein notes_key = 'protein' elif element.tag.endswith('domain'): if 'seq' in element.attrib.keys(): domain = {header: '' for header in NEW_HEADERS} # reset domain domain['Sequence'] = element.attrib['seq'] domain['X\!Tandem:hyperscore'] = element.attrib[ 'hyperscore'] domain['Calc m/z'] = (float(element.attrib['mh']) / float(group['Charge'])) + PROTON protein['Start'] = element.attrib['start'] protein['Stop'] = element.attrib['end'] protein['pre'] = element.attrib['pre'][-1] protein['post'] = element.attrib['post'][0] if protein['Start'] == '1': protein['pre'] = '-' if protein['post'] == ']': protein['post'] = '-' domain[ 'proteinacc_start_stop_pre_post_;'] = '{0}_{1}_{2}_{3}_{4};'.format( protein['Defline'], protein['Start'], protein['Stop'], protein['pre'], protein['post'], ) if protein['Defline'].startswith(decoy_tag): domain['Is decoy'] = True else: domain['Is decoy'] = False domain['rank'] = 1 elif element.tag.endswith('aa'): if 'modified' not in element.attrib.keys(): continue mod = '{0}:{1}'.format( element.attrib['modified'], int(element.attrib['at']) - int(protein['Start']) + 1) try: domain['Modifications'].append(mod) except: domain['Modifications'] = [mod] else: if element.tag.endswith('protein'): protein_groups.append(protein) if element.tag.endswith('domain'): for k, v in domain.items(): if k not in protein.keys(): protein[k] = v # domain_groups.append( domain ) elif element.tag.endswith('note'): if notes_key == 'spectrum': group['Spectrum Title'] = element.text.strip() elif notes_key == 'protein': protein['Defline'] = element.text.strip() elif element.tag.endswith('group'): group_counter -= 1 if group_counter == 0: dict2write = {} # proteins = [] for 
protein_group in protein_groups: for key in NEW_HEADERS: # if key == 'proteinacc_start_stop_pre_post_;': # proteins.append(protein_group[key]) # continue if key == 'Modifications' and protein_group[ key] != '': dict2write[key] = ';'.join(protein_group[key]) continue try: dict2write[key] = group[key] except: dict2write[key] = protein_group[key] # dict2write['proteinacc_start_stop_pre_post_;'] = ''.join(proteins) csvOut.writerow(dict2write) protein_groups = [] return
else: next_state = current_state + direction_deltas[action_indx] # Boundaries if next_state[0] < 0: next_state[0] = 0 if next_state[0] > 39: next_state[0] = 39 if next_state[1] < 0: next_state[1] = 0 if next_state[1] > 39: next_state[1] = 39 return next_state # Exploration sar_dict = ddict(list) set_of_episode_traces = [] for ep_n in range(exploration_episode_number): ### current_state = [0, 39, 1] iter_number = 1 episode_trace = ['start'] start_time = time.time() while reward(episode_trace, layout_dict) < 9 and iter_number < max_it_number: iter_number += 1 action = random.randint(0, 3) next_state_2d = list(take_action(current_state[0:-1], action)) next_state_label = layout[next_state_2d[0]][next_state_2d[1]] if next_state_label != 0: # ignoring the neutral label episode_trace.append(next_state_label) # ### SYNTH ### #
def __init__( self, path=None, noiseThreshold=0.0, extraAccessions=None, MS1_Precision=5e-6, MSn_Precision=20e-6, build_index_from_scratch=False, file_object=None, obo_version=None, ): # self.param contains user-specified parsing parameters self.param = dict() self.param['noiseThreshold'] = noiseThreshold self.param['MS1_Precision'] = MS1_Precision self.param['MSn_Precision'] = MSn_Precision self.param['accessions'] = {} # self.info contains information extracted from the mzML file self.info = dict() self.info['offsets'] = ddict() self.info['offsetList'] = [] self.info['referenceableParamGroupList'] = False self.info['spectrum_count'] = 0 self.info['chromatogram_count'] = 0 self.info['obo_version'] = obo_version self.info['encoding'] = None self.MS1_Precision = MS1_Precision self.elementList = [] # Default stuff # Can actually be either a spectrum _or_ a chromatogram; the Spectrum # class supports both self.spectrum = pymzml.spec.Spectrum( measuredPrecision=MS1_Precision, param=self.param, ) self.spectrum.clear() assert path is not None or file_object is not None, \ 'Must provide either a path or a file object to parse' self.info['fileObject'], self.info['seekable'] = self.__open_file( path, file_object) self.info['filename'] = path if self.info['seekable']: # Seekable files can use the index for random access self.seeker = self._build_index(build_index_from_scratch) self.iter = self.__init_iter() self.OT = self.__init_obo_translator(extraAccessions) return
def _parse_sequence_unimod_style(self, sequence): minPos = sequence.index("#") peptide = sequence[:minPos] addon = sequence[minPos + 1:] self.peptide = peptide if peptide != '': self.add_peptide(peptide) self['O'] += 1 self['H'] += 2 self.addon = addon unimods = self.addon.split(';') # pattern = self.regex_patterns[':pos'] pattern = re.compile(r''':(?P<pos>[0-9]*$)''') for unimod in unimods: if unimod == '': continue unimod = unimod.strip() if ':' not in unimod: sys.exit( 'This unimod: {0} requires positional information'.format( unimod)) for occ, match in enumerate(pattern.finditer(unimod)): try: unimodcomposition = self._unimod_parser.name2composition( unimod[:match.start()]) except: sys.exit( 'Cannot map unimod {0}. extracted position argument {1}' .format(unimod, match.start())) # if occ >= 1: position = int(match.group('pos')) if position in self.unimod_at_pos.keys(): print('{0} <<- Two unimods at the same position ? '.format( sequence)) raise Exception self.unimod_at_pos[position] = unimod[:match.start()] # match = re.search( position_re_pattern, unimod) # if match is not None: # end = match.start() # print( '>>>>', match) # else: # end = len( unimod ) # try: # unimodcomposition = self._unimod_parser.name2composition( # unimod[:end ] # ) # except: # print( # 'Unimod error:', unimod,'>>', unimod[:end], # re.search( position_re_pattern , unimod), # re.search( position_re_pattern , unimod).start() # ) # exit(1) # print( self , 'peptide only') # print( 'Unimod:', unimod, unimod[:end] , ) # Full addition # print( unimodcomposition , '<<<<<<') for k, v in unimodcomposition.items(): self[k] += v # storage position related modifications position = int(match.group('pos')) if position == 0: # E.g. Acetylation at pos 0 indicates N-Term # but has to be counted for position 1 in this class position = 1 if position not in self.composition_of_mod_at_pos.keys(): self.composition_of_mod_at_pos[position] = ddict(int) if position not in self.composition_at_pos.keys(): self.composition_at_pos[position] = ddict(int) for k, v in unimodcomposition.items(): self.composition_of_mod_at_pos[position][k] += v self.composition_at_pos[position][k] += v return
def __init__(self, responses, vectors):
    self.type_vectors = ddict(int)  # FastText vector file
    self.res_nvec = {}              # Responses-Normalised vectors
    self.load_vectors(vectors)
    self.normalize_responses(responses)
def main(): ''' Example script to do a fragment mass tolerance parameter sweep. usage: ./bsa_fragment_mass_tolerance_example.py If the fragment mass tolerance becomes to small, ver few peptides are found. With this small sweep the actual min accuracy of a mass spectrometer can be estimated. ''' fragment_mass_tolerance_list = [ 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, ] engine_list = ['xtandem_vengeance'] R = ursgal.UController( profile='LTQ XL low res', params={ 'database': os.path.join(os.pardir, 'example_data', 'BSA.fasta'), 'modifications': [ 'M,opt,any,Oxidation', # Met oxidation 'C,fix,any,Carbamidomethyl', # Carbamidomethylation '*,opt,Prot-N-term,Acetyl' # N-Acteylation ], }) mzML_file = os.path.join(os.pardir, 'example_data', 'BSA_fragment_mass_tolerance_example', 'BSA1.mzML') if os.path.exists(mzML_file) is False: R.params[ 'http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw' R.params['http_output_folder'] = os.path.dirname(mzML_file) R.fetch_file(engine='get_http_files_1_0_0') try: shutil.move('{0}?format=raw'.format(mzML_file), mzML_file) except: shutil.move('{0}format=raw'.format(mzML_file), mzML_file) # Convert mzML to MGF outside the loop, so this step is not repeated in the loop mgf_file = R.convert_to_mgf_and_update_rt_lookup(input_file=mzML_file) for engine in engine_list: for fragment_mass_tolerance in fragment_mass_tolerance_list: R.params['frag_mass_tolerance'] = fragment_mass_tolerance R.params['prefix'] = '{0}_fragment_mass_tolerance_'.format( fragment_mass_tolerance) unified_search_result_file = R.search( input_file=mgf_file, engine=engine, force=False, ) collector = ddict(set) for csv_path in glob.glob('{0}/*/*unified.csv'.format( os.path.dirname(mzML_file))): for line_dict in csv.DictReader(open(csv_path, 'r')): collector[csv_path].add(line_dict['Sequence']) for csv_path, peptide_set in sorted(collector.items()): file_name = os.path.basename(csv_path) tolerance = file_name.split('_')[0] print( 'Search with {0: <4} Da fragment mass tolerance found {1: >2} peptides' .format(tolerance, len(peptide_set))) return
#==============================================#
#  Created By: Svess#8004                      #
#  Last Modification: 2021-03-03 10:33 UTC+0   #
#==============================================#
# I used this to get the list of all different kakera reaction emotes so I could go
# through them and see if they were used by the bot at some point

from collections import defaultdict as ddict
import json

files = ["flood-7.json"]
Mudae = "432610292342587392"
Mudamaid2 = "488711695640821760"

emotes = ddict(str)
dct = ddict(int)

for File in files:
    searchfile = open(File, )
    data = json.load(searchfile)
    for message in data['messages']:
        if message["author"]["id"] == Mudamaid2:
            if message["reactions"] != []:
                for i in message["reactions"]:
                    dct[i["emoji"]["id"]] += 1
                    emotes[i["emoji"]["id"]] = i["emoji"]["name"]
    searchfile.close()
from sys import stdin
from collections import defaultdict as ddict

dic = True
mapp = ddict(lambda: 'eh')
for i in stdin:
    if dic:
        j = i.split()
        if len(j) == 0:
            dic = False
            continue
        mapp[j[1]] = j[0]
    else:
        print(mapp[i.strip()])
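# The script above reads "english foreign" word pairs until the first blank line, then
# translates each remaining foreign word back to English, printing the defaultdict's
# 'eh' fallback for unknown words. A self-contained sketch of the same logic on an
# in-memory sample (hypothetical input, not from the original):
from collections import defaultdict as ddict

sample = ["dog ogday", "cat atcay", "", "atcay", "pig"]
dic = True
mapp = ddict(lambda: 'eh')
for i in sample:
    if dic:
        j = i.split()
        if len(j) == 0:
            dic = False
            continue
        mapp[j[1]] = j[0]
    else:
        print(mapp[i.strip()])
# prints: cat
#         eh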
################################################
#                  DISCLAIMER                  #
#  This code is not good for 1 major reason    #
#  We are unable to see if the reactions       #
#  are on messages from the Mudae BOTs         #
#  So any reaction on any message is counted   #
#  and therefore this code will not            #
#  return a close to correct result            #
################################################

# Imports
from collections import defaultdict as ddict

# Initialising a default dict to store the reactions
dct = ddict(int)

# List of files that the program will read through
# Should be .txt files
# We recommend compiling your data using DiscordChatExporter
files = [
    # Dataset 1
    # 2021-03-15 22:00 UTC - 2021-03-31 07:26 UTC
    # After Light kakera was added
    #"resources-1/flood-1.txt",
    #"resources-1/flood-2.txt",
    #"resources-1/flood-3.txt",
    #"resources-1/flood-4.txt",
    #"resources-1/flood-5.txt",
def analyze(folder): ''' Parses the result files form search and write a result .csv file which contains the data to plot figure 2. ''' R = ursgal.UController( profile='QExactive+', params=GENERAL_PARAMS ) csv_collector = {} ve = 'qvality_2_02' sample_regex_pattern = 'sample\d_R0\d' sample_2_x_pos_and_mq_offset = {} sample_offset_combos = [] all_tested_offsets = [str(n) for n in range(-20, 21, 2)] for pos, (mq_ppm_off, mzML_file) in enumerate(MQ_OFFSET_TO_FILENAME): _sample = re.search(sample_regex_pattern, mzML_file).group() sample_2_x_pos_and_mq_offset[_sample] = (pos, mq_ppm_off) for theo_offset in all_tested_offsets: sample_offset_combos.append((_sample, theo_offset)) for csv_path in glob.glob(os.path.join('{0}'.format(folder), '*', '*_unified.csv')): dirname = os.path.dirname(csv_path) sample = re.search(sample_regex_pattern, csv_path).group() splitted_basename = os.path.basename(csv_path).split('_') offset = splitted_basename[2] precursor_ion_tolerance = splitted_basename[4] frag_ion_tolerance = splitted_basename[6] prefix = '_'.join(splitted_basename[:7]) R.params['machine_offset_in_ppm'] = offset R.params['precursor_mass_tolerance_minus'] = precursor_ion_tolerance R.params['precursor_mass_tolerance_plus'] = precursor_ion_tolerance R.params['frag_mass_tolerance'] = frag_ion_tolerance R.params['prefix'] = prefix validated_path = csv_path.replace( '_unified.csv', '_{0}_validated.csv'.format(ve) ) if os.path.exists(validated_path): csv_path = validated_path else: try: csv_path = R.validate( input_file=csv_path, engine=ve ) except: continue pit_fit = (precursor_ion_tolerance, frag_ion_tolerance) if pit_fit not in csv_collector.keys(): csv_collector[pit_fit] = ddict(set) csv_key = (sample, offset) print('Reading file: {0}'.format(csv_path)) for line_dict in csv.DictReader(open(csv_path, 'r')): if line_dict['Is decoy'] == 'true': continue if float(line_dict['PEP']) <= 0.01: csv_collector[pit_fit][csv_key].add( '{0}{1}'.format( line_dict['Sequence'], line_dict['Modifications'] ) ) fieldnames = [ 'Sample', 'pos', 'MQ_offset', 'tested_ppm_offset', 'peptide_count' ] outfile_name_format_string = 'bruderer_data_ppm_sweep_precursor_mass_tolerance_{0}_fragment_mass_tolerance_{1}.csv' for pit_fit in csv_collector.keys(): with open(outfile_name_format_string.format(*pit_fit), 'w') as io: csv_writer = csv.DictWriter(io, fieldnames) csv_writer.writeheader() # write missing values for sample_offset in sample_offset_combos: sample, ppm_offset = sample_offset if sample_offset not in csv_collector[pit_fit].keys(): dict_2_write = { 'Sample': sample, 'pos': sample_2_x_pos_and_mq_offset[sample][0], 'MQ_offset': '', 'tested_ppm_offset': ppm_offset, 'peptide_count': 0, } csv_writer.writerow(dict_2_write) for (sample, ppm_offset), peptide_set in csv_collector[pit_fit].items(): dict_2_write = { 'Sample': sample, 'pos': sample_2_x_pos_and_mq_offset[sample][0], 'MQ_offset': sample_2_x_pos_and_mq_offset[sample][1] * -1, 'tested_ppm_offset': ppm_offset, 'peptide_count': len(peptide_set), } csv_writer.writerow(dict_2_write) return
def create_year2id(self, triple_time): year2id = dict() freq = ddict(int) count = 0 year_list = [] for k, v in triple_time.items(): try: start = v[0].split('-')[0] end = v[1].split('-')[0] except: pdb.set_trace() if start.find('#') == -1 and len(start) == 4: year_list.append(int(start)) if end.find('#') == -1 and len(end) == 4: year_list.append(int(end)) # for k,v in entity_time.items(): # start = v[0].split('-')[0] # end = v[1].split('-')[0] # if start.find('#') == -1 and len(start) == 4: year_list.append(int(start)) # if end.find('#') == -1 and len(end) ==4: year_list.append(int(end)) # # if int(start) > int(end): # # pdb.set_trace() year_list.sort() for year in year_list: freq[year] = freq[year] + 1 year_class = [] count = 0 for key in sorted(freq.keys()): count += freq[key] if count > 300: year_class.append(key) count = 0 prev_year = 0 i = 0 for i, yr in enumerate(year_class): year2id[(prev_year, yr)] = i prev_year = yr + 1 year2id[(prev_year, max(year_list))] = i + 1 self.year_list = year_list # for k,v in entity_time.items(): # if v[0] == '####-##-##' or v[1] == '####-##-##': # continue # if len(v[0].split('-')[0])!=4 or len(v[1].split('-')[0])!=4: # continue # start = v[0].split('-')[0] # end = v[1].split('-')[0] # for start in start_list: # if start not in start_year2id: # start_year2id[start] = count_start # count_start+=1 # for end in end_list: # if end not in end_year2id: # end_year2id[end] = count_end # count_end+=1 return year2id
import pickle
import json
from collections import defaultdict as ddict
from collections import Counter
import os.path as osp
import numpy.random as npr
import numpy as np
from pprint import pprint

CityDic = ddict(set)
target = "business_id"
sfile = "CityRest.pkl"
lfile = "business.json"

if not osp.isfile(sfile):
    for idx, oneobs in enumerate(open(lfile, "r")):
        print(f"Current index is {idx}")
        oneobs = json.loads(oneobs)
        CityDic[oneobs["city"]].add(oneobs[target])
        if idx == 1000:
            pass
            #break
    with open(sfile, "wb") as f:
        pickle.dump(CityDic, f)

with open(sfile, "rb") as f:
    CityDic = pickle.load(f)

CityC = Counter()
for key, dat in CityDic.items():
def load_data(dataset_str, args): names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] if 'nell' in dataset_str: data_dict = pickle.load(open('./data/{}_data.pkl'.format(dataset_str), 'rb'), encoding='latin1') x, y, tx, ty, allx, ally, graph = data_dict['x'], data_dict['y'], data_dict['tx'], data_dict['ty'], data_dict['allx'], data_dict['ally'], data_dict['graph'] index = list(range(allx.shape[0])) + data_dict['test.index'] remap = {x: x for x in range(allx.shape[0])} remap.update({i+allx.shape[0]: x for i, x in enumerate(data_dict['test.index'])}) remap_inv = {v: k for k, v in remap.items()} graph_new = ddict(list) for key, val in graph.items(): if key not in remap_inv: continue graph_new[remap_inv[key]] = [remap_inv[v] for v in val if v in remap_inv] graph = graph_new test_idx_reorder = [remap_inv[x] for x in data_dict['test.index']] else: for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pickle.load(f, encoding='latin1')) else: objects.append(pickle.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range-min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range-min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y)+500) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
import json, urllib, boto3, botocore, base64, time, traceback, random, string
from collections import defaultdict as ddict

ROUTES = {}
EXTRA_AUTH = ddict(list)

BUCKET = 'caraza-harter-cs301'

ADMIN_EMAIL = '*****@*****.**'
INSTRUCTOR_EMAILS = ['*****@*****.**', '*****@*****.**']
GRADER_EMAILS = [
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
]

s3_cache = None  # client

def s3():
    # cache S3 client
    global s3_cache
    if s3_cache == None:
        s3_cache = boto3.client('s3')
    return s3_cache
def analyze(collector): """ Simle analysis script for the cascade search, counting the number of identified peptides (combination of peptide sequence and modifications) and PSMs (additionally include the spectrum ID) """ mod_list = ["Oxidation", "Deamidated", "Methyl", "Acetyl", "Phospho"] fieldnames = ([ "approach", "count_type", "validation_engine", "unmodified", "multimodified" ] + mod_list + ["total"]) csv_writer = csv.DictWriter(open("cascade_results.csv", "w"), fieldnames) csv_writer.writeheader() uc = ursgal.UController() uc.params["validation_score_field"] = "PEP" uc.params["bigger_scores_better"] = False # Count the number of identified peptides and PSMs for the different modifications # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted # and only if the best hit has a PEP that is at least two orders of # magnitude smaller than the others for validation_engine, result_file in collector.items(): counter_dict = {"psm": ddict(set), "pep": ddict(set)} grouped_psms = uc._group_psms(result_file, validation_score_field="PEP", bigger_scores_better=False) for spec_title, grouped_psm_list in grouped_psms.items(): best_score, best_line_dict = grouped_psm_list[0] if len(grouped_psm_list) > 1: second_best_score, second_best_line_dict = grouped_psm_list[1] best_peptide_and_mod = (best_line_dict["Sequence"] + best_line_dict["Modifications"]) second_best_peptide_and_mod = ( second_best_line_dict["Sequence"] + second_best_line_dict["Modifications"]) if best_peptide_and_mod == second_best_peptide_and_mod: line_dict = best_line_dict elif best_line_dict["Sequence"] == second_best_line_dict[ "Sequence"]: if best_score == second_best_score: line_dict = best_line_dict else: if (-1 * math.log10(best_score)) - ( -1 * math.log10(second_best_score)) >= 2: line_dict = best_line_dict else: continue else: if (-1 * math.log10(best_score)) - ( -1 * math.log10(second_best_score)) >= 2: line_dict = best_line_dict else: continue else: line_dict = best_line_dict count = 0 for mod in mod_list: if mod in line_dict["Modifications"]: count += 1 key_2_add = "" if count == 0: key_2_add = "unmodified" elif count >= 2: key_2_add = "multimodified" elif count == 1: for mod in mod_list: if mod in line_dict["Modifications"]: key_2_add = mod break # for peptide identification comparison counter_dict["pep"][key_2_add].add(line_dict["Sequence"] + line_dict["Modifications"]) # for PSM comparison counter_dict["psm"][key_2_add].add(line_dict["Spectrum Title"] + line_dict["Sequence"] + line_dict["Modifications"]) for counter_key, count_dict in counter_dict.items(): dict_2_write = { "approach": "cascade", "count_type": counter_key, "validation_engine": validation_engine, } total_number = 0 for key, obj_set in count_dict.items(): dict_2_write[key] = len(obj_set) total_number += len(obj_set) dict_2_write["total"] = total_number csv_writer.writerow(dict_2_write) return
def __init__(self, filename=None, run=None, overwrite=False): cElementTree.register_namespace("", "http://psi.hupo.org/ms/mzml") self.filename = filename self.lookup = {} self.newTree = None self.TreeBuilder = cElementTree.TreeBuilder() self.run = run self.info = {'counters': ddict(int)} if self.run.info['filename'].endswith('.gz'): import gzip import codecs io = codecs.getreader("utf-8")(gzip.open( self.run.info['filename'])) else: io = open(self.run.info['filename'], 'r') #read the rest as original file input_xml_string = '' pymzml_tag_written = False #open again to read as text! for line in open(self.run.info['filename'], 'r').readlines(): if 'indexedmzML' in line: # writing of indexed mzML is not possible at the moment continue if 'run' in line: # the run is appended from the original parser to avoid messing # with the new xml tree, we break before the run data starts break input_xml_string += line if 'softwareList' in line and pymzml_tag_written is False: addon = cElementTree.Element('software', { 'id': 'pymzML', 'version': "0.7.6" }) cElementTree.SubElement( addon, 'cvParam', { 'accession': 'MS:1000531', 'cvRef': 'MS', 'name': 'pymzML Writer', 'version': '0.7.6', }) new_line = cElementTree.tostring(addon, encoding='utf-8') input_xml_string += new_line pymzml_tag_written = True input_xml_string += '</mzML>\n' self.newTree = cElementTree.fromstring(input_xml_string) for event, element in cElementTree.iterparse(io, events=(b'start', b'end')): if event == b'start': if element.tag.endswith('}run'): self.lookup['run'] = cElementTree.Element( element.tag, element.attrib) if element.tag.endswith('}spectrumList'): self.lookup['spectrumList'] = \ cElementTree.Element(element.tag, element.attrib) self.lookup['spectrumIndeces'] = \ cElementTree.Element('index', {'name': 'spectrum'}) break return
def get_training_data(dbfile, V_dbfile=None): """ This function gets training data from a database returns training and test data Args: dbfile: the base SQLite db file with the training data V_dbfile: an optional held out validation SQLite db file """ db = sqlite3.connect(dbfile) cursor = db.cursor() if V_dbfile: V_db = sqlite3.connect(V_dbfile) V_cursor = V_db.cursor() split_method = args.split_method query = "" if split_method == 'entity': cursor.execute(""" select raw, entity_id, label from string join entity on entity_id = entity.id join uniquestring on uniquestring_id = uniquestring.id """) data = list(cursor.fetchall())[:args.max_samples] entity_strings = ddict(list) for raw, eid, label in data: if len(entity_strings[(eid, label)]) < 100: entity_strings[(eid, label)].append(raw) entity_strings = entity_strings.items() random.shuffle(entity_strings) datalen = len(entity_strings) splitidx = int(args.split_ratio * datalen) train_strings, test_strings = entity_strings[: splitidx], entity_strings[ splitidx:] train_X = [] train_y = [] test_X = [] test_y = [] for (eid, label), strings in train_strings: for string in strings: train_X.append(string) train_y.append(int(label)) for (eid, label), strings in test_strings: for string in strings: test_X.append(string) test_y.append(int(label)) return train_X, test_X, train_y, test_y elif split_method == 'unique': def get_unique_data(target_cursor, max_samples, skipset=None): target_cursor.execute(""" select raw, p_malware from uniquestring """) strings = [] labels = [] rows = list(target_cursor.fetchall()) random.shuffle(rows) for string, p_malware in rows: string = string.lower() if skipset and string in skipset: print "SKIPPING:", string continue if p_malware == 1.0: labels.append(1) elif p_malware < 1.0: labels.append(0) strings.append(string) if len(strings) == max_samples: break return strings, labels strings, labels = get_unique_data(cursor, args.max_samples) val_skipset = set(strings) if V_dbfile: V_vec, V_labels = get_unique_data(V_cursor, args.max_val_samples, val_skipset) datalen = len(labels) splitidx = int(args.split_ratio * datalen) train_strings, test_strings = strings[:splitidx], strings[splitidx:] train_labels, test_labels = labels[:splitidx], labels[splitidx:] if V_dbfile: return train_strings, test_strings, train_labels, test_labels, V_vec, V_labels else: return train_strings, test_strings, train_labels, test_labels
def get_paper_authorlist(data, need_paper_set):
    paper_authorlist = ddict(set)
    for (paperid, authorid, name, affi) in data.paperauthor_tuples:
        paper_authorlist[paperid].add(authorid)
    return paper_authorlist
def main(mzml=None): """ Example script fort visualizing the m/z and intensity error, which is the basis for the scoring of the matches in pyQms. Use spectrum 1165 of the BSA1.mzML example file. A subrange of the spectrum from m/z 400 to 500 is used. Usage: ./visualize_scoring_information.py Note: This example does not require a reader to access MS spectra, since a simnple peak list is used. """ peak_list = [ (404.2492407565097, 2652.905029296875), (405.3003310237508, 4831.56103515625), (408.8403673369115, 23153.7109375), (409.17476109421705, 10182.2822265625), (409.5098740355617, 4770.97412109375), (411.17196124490727, 3454.364013671875), (413.26627826402705, 6861.84912109375), (419.3157903165357, 90201.5625), (420.2440507067882, 11098.4716796875), (420.31917273788645, 22288.9140625), (420.73825281590496, 8159.7099609375), (421.2406187369968, 3768.656494140625), (427.3787652898548, 5680.43212890625), (433.3316647490907, 8430.30859375), (434.705984428002, 25924.38671875), (435.2080179219357, 11041.2060546875), (443.6708762397708, 4081.282470703125), (443.69049198141124, 5107.13330078125), (443.6974813419733, 9135.3125), (443.7112735313511, 2517650.0), (443.7282222289076, 5571.26025390625), (443.7379762316008, 5227.4033203125), (444.1998579474954, 3021.341796875), (444.21248374593875, 1156173.75), (444.71384916266277, 336326.96875), (445.21533524843596, 58547.0703125), (445.71700965093, 4182.04345703125), (446.1200302053469, 93216.3359375), (447.09963627699824, 3806.537109375), (447.1169242266495, 59846.37109375), (447.3464079857604, 13170.9541015625), (448.11566395552086, 9294.5107421875), (448.3500303628631, 3213.052490234375), (452.1123280000919, 5092.0869140625), (461.1934526664677, 4022.537353515625), (462.1463969367603, 99732.5), (463.14561508666384, 24247.015625), (464.1433022096936, 20417.041015625), (465.1421080732791, 3222.4052734375), (470.1669593722212, 8621.81640625), (475.23989190282134, 3369.073974609375), (493.27465300375036, 2725.885986328125), (496.0077303201583, 8604.0830078125), ] print("{0:-^100}".format("Library generation")) lib = pyqms.IsotopologueLibrary( molecules=["DDSPDLPK"], charges=[2], metabolic_labels=None, fixed_labels=None, verbose=True, ) print("{0:-^100}".format("Library generation")) results = lib.match_all( mz_i_list=peak_list, file_name="BSA_test", spec_id=1165, spec_rt=29.10, results=None, ) for key, i, entry in results.extract_results(): p = pymzml.plot.Factory() label_mz_error = [] label_i_error = [] measured_peaks = [] matched_peaks = [] peak_info = ddict(list) # pprint.pprint(entry.peaks) for ( measured_mz, measured_intensity, relative_i, calculated_mz, calculated_intensity, ) in entry.peaks: if measured_mz is not None: measured_peaks.append((measured_mz, measured_intensity)) matched_peaks.append( (calculated_mz, calculated_intensity * entry.scaling_factor)) mz_error = (measured_mz - calculated_mz) / (measured_mz * 1e-6) label_mz_error.append( (calculated_mz, "{0:5.3f} ppm m/z error".format(mz_error))) scaled_intensity = calculated_intensity * entry.scaling_factor rel_i_error = (abs(measured_intensity - scaled_intensity) / scaled_intensity) peak_info["measured peaks"].append(measured_mz) peak_info["theoretical peaks"].append(calculated_mz) peak_info["relative intensity"].append(relative_i) peak_info["scaled matched peaks"].append(calculated_intensity * entry.scaling_factor) peak_info["mz error"].append(mz_error) peak_info["i error"].append(rel_i_error) if rel_i_error > 1: rel_i_error = 1 label_i_error.append( (calculated_mz, "{0:5.3f} rel. 
intensity error".format(rel_i_error))) mz_only = [n[0] for n in measured_peaks] mz_range = [min(mz_only) - 1, max(mz_only) + 1] peptide = results.lookup["formula to molecule"][key.formula][0] p.newPlot( header= "Formula: {0}; Peptide: {1}; Charge: {2}\n Amount: {3:1.3f}; Score: {4:1.3f}" .format(key.formula, peptide, key.charge, entry.scaling_factor, entry.score), mzRange=mz_range, ) p.add(measured_peaks, color=(0, 0, 0), style="sticks") p.add(matched_peaks, color=(0, 200, 0), style="triangles") p.add(label_mz_error, color=(255, 0, 0), style="label_x") p.add(label_i_error, color=(255, 0, 0), style="label_x") plot_name = os.path.join( os.pardir, "data", "Score_visualization_Peptide_{1}_Charge_{2}.xhtml".format( key.file_name, peptide, key.charge), ) p.save(filename=plot_name, mzRange=mz_range) print("Plotted file {0}".format(plot_name)) # print(entry) print("Match info") for key, value_list in sorted(peak_info.items()): print(key) print("[{0}]".format(",".join([str(n) for n in value_list]))) print() return
def createGraphProxy(ssagraph):
    assert not ssagraph.procs  # should have already been inlined

    nodes = [BlockProxy(b.key, itertools.count(), block=b) for b in ssagraph.blocks]
    allnodes = nodes[:]  # will also contain indirected nodes
    entryNode = None

    intypes = ddict(set)
    for n in nodes:
        invars = [phi.rval for phi in n.block.phis]
        for b, t in n.block.jump.getSuccessorPairs():
            intypes[b.key].add(t)

        if n.bkey == ssagraph.entryKey:
            # shouldn't have more than one entryBlock and entryBlock shouldn't have phis
            assert not entryNode and not invars
            entryNode = n
            invars = ssagraph.inputArgs

        # store them in the node so we don't have to keep track separately
        # will have None placeholders for Long and Double arguments
        invars = [x for x in invars if x is not None]
        n.invars = invars

    lookup = {}
    for n in nodes:
        # should have been handled by graph.splitDualInedges()
        assert len(intypes[n.bkey]) != 2
        if False in intypes[n.bkey]:
            lookup[n.bkey, False] = n
        if True in intypes[n.bkey]:
            lookup[n.bkey, True] = n
    assert unique(lookup.values())

    for n in nodes:
        n.blockdict = lookup
        block = n.block
        for (block2, t) in block.jump.getSuccessorPairs():
            out = [phi.get((block, t)) for phi in block2.phis]
            n2 = lookup[block2.key, t]
            n.outvars[n2] = out
            n.successors.append(n2)
            n2.predecessors.append(n)

    # sanity check
    for n in allnodes:
        assert (n.block is not None) == (n.num == 0)
        assert (n is entryNode) == (len(n.predecessors) == 0)
        assert unique(n.predecessors)
        assert unique(n.successors)
        for pn in n.predecessors:
            assert n in pn.successors
        assert set(n.outvars) == set(n.successors)
        for sn in n.successors:
            assert n in sn.predecessors
            assert len(n.outvars[sn]) == len(sn.invars)
    return entryNode, allnodes
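# Hedged helper sketch (assumption): unique() is only used in the assertions above
# and is not defined in this snippet; it presumably checks that a sequence contains
# no duplicate elements. A minimal version compatible with that usage:
def unique(seq):
    seq = list(seq)
    return len(seq) == len(set(seq))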
def preflight(self): ''' Formatting the command line and writing the param input file via self.params Returns: dict: self.params ''' self.param_file_name = os.path.join(self.params['output_dir_path'], 'msfragger.params') # further prepare and translate params # pprint.pprint(self.params['translations']['_grouped_by_translated_key']) # pprint.pprint(self.params) # exit() self.params_to_write = { 'output_file_extension': 'tsv', # tsv or pepXML we fix it... 'output_format': 'tsv', # pepXML or tsv 'digest_mass_range': '{0} {1}'.format( self.params['translations']['_grouped_by_translated_key'] ['precursor_min_mass']['precursor_min_mass'], self.params['translations']['_grouped_by_translated_key'] ['precursor_max_mass']['precursor_max_mass']) } write_exclusion_list = [ 'precursor_min_mass', 'precursor_max_mass', 'precursor_min_charge', 'precursor_max_charge', 'label', '-Xmx', 'header_translations', 'validation_score_field' ] additional_15N_modifications = [] if self.params['translations']['_grouped_by_translated_key']['label'][ 'label'] == '15N': self.print_info( 'Search with label=15N may still be errorprone. Evaluate with care!', caller='WARNING') for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items(): existing = False for mod_dict in self.params['mods']['fix']: if aminoacid == mod_dict['aa']: mod_dict['mass'] += N15_Diff mod_dict['name'] += '_15N_{0}'.format(aminoacid) existing = True if existing == True: continue else: mod_key = 'add_{0}_{1}'.format( aminoacid, ursgal.chemical_composition_kb.aa_names[aminoacid]) self.params_to_write[mod_key] = N15_Diff for msfragger_param_name in self.params['translations'][ '_grouped_by_translated_key'].keys(): for ursgal_param_name, param_value in self.params['translations'][ '_grouped_by_translated_key'][msfragger_param_name].items( ): if msfragger_param_name in write_exclusion_list: continue elif msfragger_param_name == 'enzyme': ''' search_enzyme_name = Trypsin search_enzyme_cutafter = KR search_enzyme_butnotafter = P ''' aa_site, term, inhibitor = param_value.split(';') self.params_to_write['search_enzyme_name'] = self.params[ 'enzyme'] self.params_to_write['search_enzyme_cutafter'] = aa_site self.params_to_write[ 'search_enzyme_butnotafter'] = inhibitor elif msfragger_param_name == 'num_enzyme_termini': # num_enzyme_termini = 2 # 2 for enzymatic, 1 for # semi-enzymatic, 0 for nonspecific digestion if self.params['translations'][ '_grouped_by_translated_key']['enzyme'][ 'enzyme'] == 'nonspecific': self.params_to_write[msfragger_param_name] = 0 else: self.params_to_write[ msfragger_param_name] = param_value elif msfragger_param_name == 'clear_mz_range': min_mz, max_mz = param_value self.params_to_write[ msfragger_param_name] = '{0} {1}'.format( min_mz, max_mz) elif msfragger_param_name == 'modifications': ''' #maximum of 7 mods - amino acid codes, * for any amino acid, [ and ] specifies protein termini, n and c specifies peptide termini variable_mod_01 = 15.9949 M variable_mod_02 = 42.0106 [* #variable_mod_03 = 79.96633 STY #variable_mod_03 = -17.0265 nQnC #variable_mod_04 = -18.0106 nE ''' # print(self.params['translations']['_grouped_by_translated_key'][msfragger_param_name]) # pprint.pprint(self.params[ 'mods' ]) # exit() mass_to_mod_aa = ddict(list) for mod_dict in self.params['mods']['opt']: ''' {'_id': 0, 'aa': '*', 'composition': {'C': 2, 'H': 2, 'O': 1}, 'id': '1', 'mass': 42.010565, 'name': 'Acetyl', 'org': '*,opt,Prot-N-term,Acetyl', 'pos': 'Prot-N-term', 'unimod': True}, ''' aa_to_append = mod_dict['aa'] pos_modifier = None if 
mod_dict['pos'] == 'Prot-N-term': pos_modifier = '[' elif mod_dict['pos'] == 'Prot-C-term': pos_modifier = ']' elif mod_dict['pos'] == 'N-term': pos_modifier = 'n' elif mod_dict['pos'] == 'C-term': pos_modifier = 'c' elif mod_dict['pos'] == 'any': pass else: print(''' Unknown positional argument for given modification: {0} MSFragger cannot deal with this, please use one of the follwing: any, Prot-N-term, Prot-C-term, N-term, C-term '''.format(mod_dict['org'])) sys.exit(1) if pos_modifier is not None: aa_to_append = '{0}{1}'.format( pos_modifier, aa_to_append) mass_to_mod_aa[mod_dict['mass']].append(aa_to_append) for pos, (mass, aa_list) in enumerate(mass_to_mod_aa.items()): self.params_to_write['variable_mod_0{0}'.format( pos + 1)] = '{0} {1}'.format( mass, ''.join(aa_list)) for mod_dict in self.params['mods']['fix']: ''' add_C_cysteine = 57.021464 # added to C - avg. 103.1429, mono. 103.00918 ''' if mod_dict['pos'] == 'Prot-N-term': mod_key = 'add_Nterm_protein' elif mod_dict['pos'] == 'Prot-C-term': mod_key = 'add_Cterm_protein' elif mod_dict['pos'] == 'N-term': mod_key = 'add_Nterm_peptide' elif mod_dict['pos'] == 'C-term': mod_key = 'add_Cterm_peptide' else: mod_key = 'add_{0}_{1}'.format( mod_dict['aa'], ursgal.chemical_composition_kb.aa_names[ mod_dict['aa']]) self.params_to_write[mod_key] = mod_dict['mass'] elif msfragger_param_name == 'override_charge': self.params_to_write[msfragger_param_name] = param_value if param_value == 1: self.params_to_write[ 'precursor_charge'] = '{0} {1}'.format( self.params['translations'] ['_grouped_by_translated_key'] ['precursor_min_charge'] ['precursor_min_charge'], self.params['translations'] ['_grouped_by_translated_key'] ['precursor_max_charge'] ['precursor_max_charge']) else: self.params_to_write[msfragger_param_name] = param_value self.write_params_file() self.input_file = os.path.join(self.params['input_dir_path'], self.params['input_file']) if self.input_file.lower().endswith('.mzml') or \ self.input_file.lower().endswith('.mzml.gz') or \ self.input_file.lower().endswith('.mgf'): self.params['translations']['mzml_input_file'] = self.input_file # elif self.input_file.lower().endswith('.mgf'): # self.params['translations']['mzml_input_file'] = \ # self.meta_unodes['ucontroller'].get_mzml_that_corresponds_to_mgf( self.input_file ) # self.print_info( # 'MSFragger can only read Proteowizard MGF input files,' # 'the corresponding mzML file {0} will be used instead.'.format( # os.path.abspath(self.params['translations']['mzml_input_file']) # ), # caller = "INFO" # ) else: raise Exception( 'MSFragger input spectrum file must be in mzML or MGF format!') # pprint.pprint(self.params['translations']) # exit() self.params['command_list'] = [ 'java', '-Xmx{0}'.format(self.params['translations'] ['_grouped_by_translated_key']['-Xmx']['-xmx']), '-jar', self.exe, self.param_file_name, self.params['translations']['mzml_input_file'] ] self.params['translations']['output_file_incl_path'] = os.path.join( self.params['output_dir_path'], self.params['output_file']) return self.params
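# Hedged sketch (assumption; the real method is defined elsewhere in the class):
# self.write_params_file() is called above but not shown. MSFragger reads a plain
# "key = value" parameter file, so a minimal writer for the pairs collected in
# self.params_to_write could look like this:
def write_params_file(param_file_name, params_to_write):
    with open(param_file_name, 'w') as io:
        for key, value in sorted(params_to_write.items()):
            io.write('{0} = {1}\n'.format(key, value))

# e.g. write_params_file('msfragger.params',
#                        {'output_format': 'tsv', 'digest_mass_range': '500 5000'})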
def load_data(self): triple_set = [] with open(self.p.triple2id, 'r') as filein: for line in filein: tup = (int(line.split()[0].strip()), int(line.split()[1].strip()), int(line.split()[2].strip())) triple_set.append(tup) triple_set = set(triple_set) train_triples = [] self.start_time, self.end_time, self.num_class = ddict(dict), ddict( dict), ddict(dict) triple_time, entity_time = dict(), dict() self.inp_idx, self.start_idx, self.end_idx, self.labels = ddict( list), ddict(list), ddict(list), ddict(list) max_ent, max_rel, count = 0, 0, 0 with open(self.p.dataset, 'r') as filein: for line in filein: train_triples.append( [int(x.strip()) for x in line.split()[0:3]]) triple_time[count] = [ x.split('-')[0] for x in line.split()[3:5] ] count += 1 # self.start_time['triple'], self.end_time['triple'] = self.create_year2id(triple_time,'triple') with open(self.p.entity2id, 'r', encoding="utf-8") as filein2: for line in filein2: # entity_time[int(line.split('\t')[1])]=[x.split()[0] for x in line.split()[2:4]] max_ent = max_ent + 1 self.year2id = self.create_year2id(triple_time) # self.start_time['entity'], self.end_time['entity'] = self.create_year2id(entity_time,'entiy') # self.inp_idx['entity'],self.start_idx['entity'], self.end_idx['entity'] = self.create_id_labels(entity_time,'entity') self.inp_idx['triple'], self.start_idx['triple'], self.end_idx[ 'triple'] = self.create_id_labels(triple_time, 'triple') #pdb.set_trace() for i, ele in enumerate(self.inp_idx['entity']): if self.start_idx['entity'][i] > self.end_idx['entity'][i]: print(self.inp_idx['entity'][i], self.start_idx['entity'][i], self.end_idx['entity'][i]) self.num_class = len(self.year2id.keys()) # for dtype in ['entity','triple']: # self.labels[dtype] = self.getOneHot(self.start_idx[dtype],self.end_idx[dtype], self.num_class)# Representing labels by one hot notation keep_idx = set(self.inp_idx['triple']) for i in range(len(train_triples) - 1, -1, -1): if i not in keep_idx: del train_triples[i] with open(self.p.relation2id, 'r') as filein3: for line in filein3: max_rel = max_rel + 1 index = randint(1, len(train_triples)) - 1 posh, rela, post = zip(*train_triples) head, rel, tail = zip(*train_triples) posh = list(posh) post = list(post) rela = list(rela) head = list(head) tail = list(tail) rel = list(rel) for i in range(len(posh)): if self.start_idx['triple'][i] < self.end_idx['triple'][i]: for j in range(self.start_idx['triple'][i] + 1, self.end_idx['triple'][i] + 1): head.append(posh[i]) rel.append(rela[i]) tail.append(post[i]) self.start_idx['triple'].append(j) self.ph, self.pt, self.r,self.nh, self.nt , self.triple_time = [], [], [], [], [], [] for triple in range(len(head)): neg_set = set() for k in range(self.p.M): possible_head = randint(0, max_ent - 1) while (possible_head, rel[triple], tail[triple] ) in triple_set or (possible_head, rel[triple], tail[triple]) in neg_set: possible_head = randint(0, max_ent - 1) self.nh.append(possible_head) self.nt.append(tail[triple]) self.r.append(rel[triple]) self.ph.append(head[triple]) self.pt.append(tail[triple]) self.triple_time.append(self.start_idx['triple'][triple]) neg_set.add((possible_head, rel[triple], tail[triple])) for triple in range(len(tail)): neg_set = set() for k in range(self.p.M): possible_tail = randint(0, max_ent - 1) while (head[triple], rel[triple], possible_tail ) in triple_set or (head[triple], rel[triple], possible_tail) in neg_set: possible_tail = randint(0, max_ent - 1) self.nh.append(head[triple]) self.nt.append(possible_tail) self.r.append(rel[triple]) 
self.ph.append(head[triple]) self.pt.append(tail[triple]) self.triple_time.append(self.start_idx['triple'][triple]) neg_set.add((head[triple], rel[triple], possible_tail)) # self.triple_time = triple_time # self.entity_time = entity_time self.max_rel = max_rel self.max_ent = max_ent self.max_time = len(self.year2id.keys()) self.data = list( zip(self.ph, self.pt, self.r, self.nh, self.nt, self.triple_time)) self.data = self.data + self.data[0:self.p.batch_size]
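# Hedged illustration (not part of the original class): the two loops above perform
# "filtered" negative sampling, i.e. a corrupted head (or tail) is re-drawn until the
# corrupted triple is neither a known true triple nor a previously drawn negative.
from random import randint

def sample_negative_head(head, rel, tail, max_ent, triple_set, neg_set):
    possible_head = randint(0, max_ent - 1)
    while (possible_head, rel, tail) in triple_set or \
            (possible_head, rel, tail) in neg_set:
        possible_head = randint(0, max_ent - 1)
    neg_set.add((possible_head, rel, tail))
    return possible_head

# e.g. sample_negative_head(0, 1, 2, 100, {(0, 1, 2)}, set())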
def parse_evidence( fixed_labels=None, evidence_files=None, molecules=None, evidence_score_field=None, return_raw_csv_data=False, ): """ Reads in the evidence file and returns the final formatted fixed labels, the evidence lookup, which is passed to the isotopologue library and the final formatted molecules (fixed labels are stripped form the molecules). Note: Output .csv files from `Ursgal`_ (`Documentation`_) can directly be used. Also `mzTab`_ files can be used as input. .. _Ursgal: https://github.com/ursgal/ursgal .. _Documentation: http://ursgal.readthedocs.io/en/latest/ .. _mzTab: http://www.psidev.info/mztab Args: fixed_labels (dict): dict with fixed labels, example format is shown below. evidence_files (list): list of evidence file paths. molecules (list): list of additional molecules evidence_score_field (str): specify fieldname which holds the search engine score (Default is "PEP") Example fixed label format:: { 'C' : [ { 'element': { 'O': 1, 'H': 3, '14N': 1, 'C': 2 }, 'evidence_mod_name': 'Carbamidomethyl' }, ] } Returns: tuple: final formatted fixed label dict, evidence lookup, list of molecules """ if molecules is None: molecules = [] if evidence_score_field is None: evidence_score_field = "PEP" # default unimod_parser = pyqms.UnimodMapper() fixed_mod_lookup = {} amino_acid_2_fixed_mod_name = ddict(list) formatted_fixed_labels = None evidence_lookup = None molecule_set = set() all_fixed_mod_names = set() if fixed_labels is not None and len(fixed_labels.keys()) != 0: formatted_fixed_labels = {} for aa, fixed_mod_info_dict_list in fixed_labels.items(): for fixed_mod_info_dict in fixed_mod_info_dict_list: if isinstance(fixed_mod_info_dict["element_composition"], dict): tmp_cc_factory = pyqms.chemical_composition.ChemicalComposition( ) tmp_cc_factory.add_chemical_formula( fixed_mod_info_dict["element_composition"]) else: tmp_cc_factory = fixed_mod_info_dict["element_composition"] # print(type(tmp_cc_factory)) # print(fixed_mod_info_dict) if aa not in formatted_fixed_labels.keys(): formatted_fixed_labels[aa] = [] formatted_fixed_labels[aa].append( tmp_cc_factory.hill_notation_unimod()) # save it under name and amino acid! 
fixed_mod_lookup[fixed_mod_info_dict[ "evidence_mod_name"]] = dc(tmp_cc_factory) amino_acid_2_fixed_mod_name[aa].append( fixed_mod_info_dict["evidence_mod_name"]) all_fixed_mod_names.add( fixed_mod_info_dict["evidence_mod_name"]) tmp_cc_factory.clear() cc_factory = pyqms.chemical_composition.ChemicalComposition() # this is the lookup for the lib with the evidences # tmp_evidences = ddict(list) tmp_evidences = {} csv_raw_data_to_return = {} # tmp_charges_of_evidences = set() for evidence_file in evidence_files: input_is_csv = False evidence_lookup = {} with codecs.open(evidence_file, mode="r", encoding="utf-8") as openend_evidence_file: # first buffer the file here depending on mztab andf csv input if evidence_file.upper().endswith("CSV"): dict_reader = csv.DictReader(openend_evidence_file) modification_fieldname = "Modifications" rt_fieldname = "Retention Time (s)" seq_fieldname = "Sequence" input_is_csv = True elif evidence_file.upper().endswith("MZTAB"): dict_reader = csv.DictReader( [ row for row in openend_evidence_file if row[:3] in ["PSM", "PSH"] ], delimiter="\t", ) modification_fieldname = "modifications" rt_fieldname = "retention_time" seq_fieldname = "sequence" else: print( "The format {0} is not recognized by the pyQms adaptor function" .format(os.path.splitext(evidence_file)[1])) input_buffer = [] for line_dict in dict_reader: input_buffer.append(line_dict) csv_raw_data_to_return[evidence_file] = input_buffer for line_dict in input_buffer: modifications = line_dict.get(modification_fieldname, "") if modifications == "": molecule = line_dict[seq_fieldname] else: if input_is_csv: formatted_mods = line_dict[modification_fieldname] else: formatted_mods = [] # 2-UNIMOD:4,3-UNIMOD:4 for pos_and_unimod_id in line_dict[ modification_fieldname].split(","): pos, unimod_id = pos_and_unimod_id.split("-") unimod_name = unimod_parser.id2name( unimod_id.split(":")[1]) formatted_mods.append("{0}:{1}".format( unimod_name, pos)) formatted_mods = ";".join(formatted_mods) molecule = "{0}#{1}".format(line_dict[seq_fieldname], formatted_mods) dict_2_append = {} rt = line_dict.get(rt_fieldname, "") # seconds is the standard also for mzTab if rt != "": dict_2_append["RT"] = float(rt) / 60.0 # always in min score = line_dict.get(evidence_score_field, "") if score != "": dict_2_append["score"] = float(score) dict_2_append["score_field"] = evidence_score_field else: dict_2_append["score"] = "None" dict_2_append["score_field"] = "None" if molecule not in tmp_evidences.keys(): tmp_evidences[molecule] = { "evidences": [], "trivial_names": set() } tmp_evidences[molecule]["evidences"].append(dict_2_append) for trivial_name_key in [ "proteinacc_start_stop_pre_post_;", # old ursgal style "trivial_name", # self defined name "Protein ID", # new ursgal style "accession", # mzTab style ]: additional_name = line_dict.get(trivial_name_key, "") if additional_name != "": # use set to remove double values tmp_evidences[molecule]["trivial_names"].add( additional_name) mod_pattern = re.compile(r""":(?P<pos>[0-9]*$)""") all_molecules = list(molecules) if len(tmp_evidences.keys()) > 0: all_molecules += list(tmp_evidences.keys()) for molecule_and_mods in sorted(all_molecules): # try to convert trivial name set to list for conveniences try: tmp_evidences[molecule_and_mods]["trivial_names"] = sorted( list(set(tmp_evidences[molecule_and_mods]["trivial_names"]))) except: pass # print(molecule_and_mods) if "#" in molecule_and_mods: molecule, modifications = molecule_and_mods.split("#") else: molecule = molecule_and_mods 
modifications = None fixed_label_mod_addon_names = [] if modifications is not None: mods_to_delete = [] mod_list = modifications.split(";") for pos_in_mod_list, mod_and_pos in enumerate(mod_list): # OLD STYLE, no ':' in mod allowed! # mod, pos = mod_and_pos.split(':') # NEW STYLE, SILAC does not crash... for match in mod_pattern.finditer(mod_and_pos): pos = int(match.group("pos")) mod = mod_and_pos[:match.start()] break modded_aa = molecule[int(pos) - 1] if (formatted_fixed_labels is not None and modded_aa in formatted_fixed_labels.keys() and mod in all_fixed_mod_names): fixed_label_mod_addon_names.append(mod) mods_to_delete.append(pos_in_mod_list) for modpos_2_remove in sorted(mods_to_delete, reverse=True): mod_list.pop(modpos_2_remove) if len(mod_list) > 0: molecule = "{0}#{1}".format(molecule, ";".join(mod_list)) else: # nosetest does not line else and pass # molecule = molecule pass else: # fail check if fixed mod is not in the modifications! # add all fixed modification! if formatted_fixed_labels is not None: for aa in molecule: if aa in formatted_fixed_labels.keys(): for mod_name in amino_acid_2_fixed_mod_name[aa]: fixed_label_mod_addon_names.append(mod_name) # print(molecule) if molecule.startswith("+"): cc_factory.add_chemical_formula(molecule) else: cc_factory.use(molecule) if len(fixed_label_mod_addon_names) != 0: for fixed_mod_name in fixed_label_mod_addon_names: cc_factory.add_chemical_formula( fixed_mod_lookup[fixed_mod_name]) complete_formula = cc_factory.hill_notation_unimod() molecule_set.add(molecule) if molecule_and_mods in tmp_evidences.keys(): if complete_formula not in evidence_lookup.keys(): evidence_lookup[complete_formula] = {} evidence_lookup[complete_formula][ molecule_and_mods] = tmp_evidences[molecule_and_mods] cc_factory.clear() molecule_list = list(molecule_set) if return_raw_csv_data: return ( formatted_fixed_labels, evidence_lookup, molecule_list, csv_raw_data_to_return, ) else: return formatted_fixed_labels, evidence_lookup, molecule_list
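# Hedged mini-example (illustrative): the regex used above, r":(?P<pos>[0-9]*$)",
# anchors on the last ':' of a modification string, so modification names that
# themselves contain ':' (e.g. SILAC-style labels) still split cleanly into
# (name, position).
import re

mod_pattern = re.compile(r':(?P<pos>[0-9]*$)')

def split_mod(mod_and_pos):
    match = next(mod_pattern.finditer(mod_and_pos))
    return mod_and_pos[:match.start()], int(match.group('pos'))

# split_mod('Oxidation:7')      -> ('Oxidation', 7)
# split_mod('Label:13C(6):3')   -> ('Label:13C(6)', 3)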
def __init__(self,
             filter_freq=200,
             filter_stages=[],
             url_stats='result/matrix?show_stage_details=true&show_item_details=true',
             url_rules='formula',
             path_stats='data/matrix.json',
             path_rules='data/formula.json',
             update=False,
             banned_stages={},
             # expValue=30,
             ConvertionDR=0.18,
             display_main_only=True):
    """
    Object initialization.

    Args:
        filter_freq: int or None. The lowest frequency that we consider.
            No filter will be applied if None.
        url_stats: string. url to the dropping rate stats data.
        url_rules: string. url to the composing rules data.
        path_stats: string. local path to the dropping rate stats data.
        path_rules: string. local path to the composing rules data.
    """
    try:
        material_probs, self.convertion_rules = load_data(path_stats, path_rules)
    except:
        print('Requesting data from web resources (i.e., penguin-stats.io)...',
              end=' ')
        material_probs, self.convertion_rules = request_data(
            penguin_url + url_stats, penguin_url + url_rules,
            path_stats, path_rules)
        print('done.')

    if update:
        print('Requesting data from web resources (i.e., penguin-stats.io)...',
              end=' ')
        material_probs, self.convertion_rules = request_data(
            penguin_url + url_stats, penguin_url + url_rules,
            path_stats, path_rules)
        print('done.')

    self.exp_factor = 1
    self.material_probs = material_probs
    self.banned_stages = banned_stages
    self.display_main_only = display_main_only

    self.stage_times = ddict(int)
    filtered_probs = []
    needed_stage = []
    for dct in material_probs['matrix']:
        # remember the largest sample size seen for each stage
        if dct['times'] > self.stage_times[dct['stage']['code']] or \
                self.stage_times[dct['stage']['code']] == 0:
            self.stage_times[dct['stage']['code']] = dct['times']
        # keep well-sampled stages; remember the rest as needed but unreliable
        if dct['times'] >= filter_freq and dct['stage']['code'] not in filter_stages:
            filtered_probs.append(dct)
        elif dct['stage']['code'] not in needed_stage:
            needed_stage.append(dct['stage']['code'])
    material_probs['matrix'] = filtered_probs

    self.ConvertionDR = ConvertionDR
    self._pre_processing(material_probs)
    self._set_lp_parameters()
def group_styles(self):
    '''
    Parses self.items() and builds up lookups. Additionally, a consistency
    check is performed to guarantee that each engine maps onto only one style.

    The lookup that is built and returned looks like::

        lookup = {
            'style_2_engine' : {
                'xtandem_style_1' : [
                    'xtandem_sledgehamer',
                    'xtandem_cylone',
                    ...
                ],
                'omssa_style_1' ...
            },
            # This is done during uNode initializations
            # each unode will register its style with umapmaster
            #
            'engine_2_style' : {
                'xtandem_sledgehamer' : 'xtandem_style_1', ...
            },
            'engine_2_params' : {
                'xtandem_sledgehamer' : [ uparam1, uparam2, ... ], ...
            },
            'style_2_params' : {
                'xtandem_style_1' : [ uparam1, uparam2, ... ], ...
            },
            'params_triggering_rerun' : {
                'xtandem_style_1' : [ uparam1, uparam2, ... ]
            }
        }
    '''
    lookup = {
        'style_2_engine': ddict(set),
        'engine_2_style': {},
        # these two are not in the docu yet ...
        'engine_2_params': ddict(list),
        'style_2_params': ddict(list),
        'params_triggering_rerun': ddict(list)
    }
    for uparam, udict in sorted(self.items()):
        # print( uparam, end = '\t')
        # if uparam == 'force':
        #     print(udict)
        for style in udict['ukey_translation'].keys():
            try:
                style_basename, style_version = style.split('_style_')
            except:
                print('Syntax Error @ uparam {0}'.format(uparam))
                print('style : {0}'.format(style))
                exit(1)
            vvv = False
            if style_basename == 'ucontroller':
                vvv = True
                # print('UController params {0}'.format( uparam ))
            # else:
            #     print()
            styles_seen = set()
            for engine in udict['available_in_unode']:
                # if vvv:
                #     print(engine)
                if style_basename not in engine:
                    continue
                # this style 2 engine lookup is not quite right ...
                # This function requires unode meta info for proper mapping ...
                # lookup['style_2_engine'][ style ].add( engine )
                lookup['engine_2_params'][engine].append(uparam)
                if style not in styles_seen:
                    lookup['style_2_params'][style].append(uparam)
                    if udict.get('triggers_rerun', True):
                        lookup['params_triggering_rerun'][style].append(uparam)
                    styles_seen.add(style)
                parsed_e2s = lookup['engine_2_style'].get(engine, None)
                if parsed_e2s is None:
                    lookup['engine_2_style'][engine] = style
                else:
                    if parsed_e2s != style:
                        print('{0} was found to map on style {1} and {2}'.format(
                            engine, parsed_e2s, style))
    return lookup
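# Hedged usage sketch (engine and style names are the hypothetical ones from the
# docstring above): once group_styles() has been called, the returned lookup can be
# queried like a plain dictionary, for example to list all engines of one style.
def engines_for_style(lookup, style):
    return sorted(engine for engine, s in lookup['engine_2_style'].items() if s == style)

# engines_for_style(lookup, 'xtandem_style_1')
# e.g. -> ['xtandem_cylone', 'xtandem_sledgehamer']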
def __init__(self, prop, **kwargs):
    super(PropMerge, self).__init__(name=prop, **kwargs)
    self.seen = set()
    self._syms = ddict(list)