def guess_mnemo(cls, bs, attrib, pre_dis_info, offset):
    candidates = set()
    fname_values = pre_dis_info
    todo = [
        (dict(fname_values), branch, offset * 8)
        for branch in list(viewitems(cls.bintree))
    ]
    for fname_values, branch, offset_b in todo:
        (l, fmask, fbits, fname, flen), vals = branch
        if flen is not None:
            l = flen(attrib, fname_values)
        if l is not None:
            try:
                v = cls.getbits(bs, attrib, offset_b, l)
            except IOError:
                # Raised if offset is out of bound
                continue
            offset_b += l
            if v & fmask != fbits:
                continue
            if fname is not None and fname not in fname_values:
                fname_values[fname] = v
        for nb, v in viewitems(vals):
            if 'mn' in nb:
                candidates.update(v)
            else:
                todo.append((dict(fname_values), (nb, v), offset_b))
    return list(candidates)
def dropped_samples(self):
    """The samples that were selected but dropped in processing

    Returns
    -------
    dict of sets
        Format is {artifact_id: {sample_id, sample_id, ...}, ...}
    """
    with qdb.sql_connection.TRN:
        bioms = self.biom_tables
        if not bioms:
            return {}

        # get all samples selected for the analysis, converting lists to
        # sets for fast searching. Overhead less this way
        # for large analyses
        all_samples = {k: set(v) for k, v in viewitems(self.samples)}

        for biom, filepath in viewitems(bioms):
            table = load_table(filepath)
            ids = set(table.ids())
            for k in all_samples:
                all_samples[k] = all_samples[k] - ids

        # what's left are unprocessed samples, so return
        return all_samples
def _redundantFree(self, blocks):
    """
    Redundant-free Comparisons from Kolb et al, "Dedoop:
    Efficient Deduplication with Hadoop"

    http://dbs.uni-leipzig.de/file/Dedoop.pdf
    """
    coverage = defaultdict(list)

    for block_id, records in enumerate(blocks):
        for record_id, record in viewitems(records):
            coverage[record_id].append(block_id)

    for block_id, records in enumerate(blocks):
        if block_id % 10000 == 0:
            logger.info("%s blocks" % block_id)

        marked_records = []
        for record_id, record in viewitems(records):
            smaller_ids = {covered_id
                           for covered_id in coverage[record_id]
                           if covered_id < block_id}
            marked_records.append((record_id, record, smaller_ids))

        yield marked_records
def factor_one_bit(tree):
    if isinstance(tree, set):
        return tree
    new_keys = defaultdict(lambda: defaultdict(dict))
    if len(tree) == 1:
        return tree
    for k, v in viewitems(tree):
        if k == "mn":
            new_keys[k] = v
            continue
        l, fmask, fbits, fname, flen = k
        if flen is not None or l <= 1:
            new_keys[k] = v
            continue
        cfmask = fmask >> (l - 1)
        nfmask = fmask & ((1 << (l - 1)) - 1)
        cfbits = fbits >> (l - 1)
        nfbits = fbits & ((1 << (l - 1)) - 1)
        ck = 1, cfmask, cfbits, None, flen
        nk = l - 1, nfmask, nfbits, fname, flen
        if nk in new_keys[ck]:
            raise NotImplementedError('not fully functional')
        new_keys[ck][nk] = v
    for k, v in list(viewitems(new_keys)):
        new_keys[k] = factor_one_bit(v)
    # try to factor sons
    if len(new_keys) != 1:
        return new_keys
    subtree = next(iter(viewvalues(new_keys)))
    if len(subtree) != 1:
        return new_keys
    if next(iter(subtree)) == 'mn':
        return new_keys
    return new_keys
def dropped_samples(self):
    """The samples that were selected but dropped in processing

    Returns
    -------
    dict of sets
        Format is {processed_data_id: {sample_id, sample_id, ...}, ...}
    """
    bioms = self.biom_tables
    if not bioms:
        return {}

    # get all samples selected for the analysis, converting lists to
    # sets for fast searching. Overhead less this way for large analyses
    all_samples = {k: set(v) for k, v in viewitems(self.samples)}

    for biom, filepath in viewitems(bioms):
        table = load_table(filepath)
        # remove the samples from the sets as they are found in the table
        proc_data_ids = set(sample['Processed_id']
                            for sample in table.metadata())
        ids = set(table.ids())
        for proc_data_id in proc_data_ids:
            all_samples[proc_data_id] = all_samples[proc_data_id] - ids

    # what's left are unprocessed samples, so return
    return all_samples
def post(self):
    barcodes = self.get_argument('barcodes').split(',')
    if self.get_argument('blanks'):
        blanks = self.get_argument('blanks').split(',')
    else:
        blanks = []
    if self.get_argument('external'):
        external = self.get_argument('external').split(',')
    else:
        external = []

    # Get metadata and create zip file
    metadata, failures = db.pulldown(barcodes, blanks, external)

    meta_zip = InMemoryZip()
    failed = '\n'.join(['\t'.join(bc) for bc in viewitems(failures)])
    failtext = ("The following barcodes were not retrieved "
                "for any survey:\n%s" % failed)
    meta_zip.append("failures.txt", failtext)
    for survey, meta in viewitems(metadata):
        meta_zip.append('survey_%s_md.txt' % survey, meta)

    # write out zip file
    self.add_header('Content-type', 'application/octet-stream')
    self.add_header('Content-Transfer-Encoding', 'binary')
    self.add_header('Accept-Ranges', 'bytes')
    self.add_header('Content-Encoding', 'none')
    self.add_header('Content-Disposition',
                    'attachment; filename=metadata.zip')
    self.write(meta_zip.write_to_buffer())
    self.flush()
    self.finish()
def to_object(data, field_maps, cls=None, wrap=True, dc=None):
    """Convert the data to the common convention and optionally wrap it
    in the provided class.

    1. Convert the data to the common convention,
    2. Wrap the converted data in the provided class and return the
       instance of that class

    :param dc: datacenter name to be set as an attribute on all objects
    :param data: this data should be either a list of dicts or a dict itself
    :param field_maps: a dict of mapping values in the form of
        {mapping: original}
    :param cls: class in which the data is to be wrapped
    :param wrap: whether to wrap the data in the class or not
    :return: instance of cls
    """
    if isinstance(data, list):
        new_data = []
        for instance in data:
            if dc:
                instance.update({"datacenter": dc})
                field_maps.update({"datacenter": "datacenter"})
            new_data.append({
                mapping: instance.get(orig, None)
                for mapping, orig in viewitems(field_maps)
            })
        if wrap:
            return [cls(**instance) for instance in new_data]
        else:
            return new_data
    elif isinstance(data, dict):
        if dc:
            data.update({"datacenter": dc})
            field_maps.update({"datacenter": "datacenter"})
        new_data = {mapping: data.get(orig, None)
                    for mapping, orig in viewitems(field_maps)}
        if wrap:
            return cls(**new_data)
        else:
            return new_data
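# A minimal usage sketch for to_object, assuming a hypothetical Server class
# whose constructor accepts the mapped field names as keyword arguments.
# The class name and the field names below are made up for illustration only.
class Server(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

raw = {"serverName": "web-01", "memoryMb": 2048, "ignored": "x"}
field_maps = {"name": "serverName", "memory": "memoryMb"}

server = to_object(raw, field_maps, cls=Server, dc="dc1")
# server.name == "web-01", server.memory == 2048, server.datacenter == "dc1"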
def rollout(env, agent, timestep_limit):
    """
    Simulate the env and agent for timestep_limit steps
    """
    ob = env.reset()
    terminated = False

    data = defaultdict(list)
    for _ in range(timestep_limit):
        ob = agent.obfilt(ob)
        data["observation"].append(ob)
        action, agentinfo = agent.act(ob)
        data["action"].append(action)
        for (k, v) in viewitems(agentinfo):
            data[k].append(v)
        ob, rew, done, envinfo = env.step(action)
        data["reward"].append(rew)
        rew = agent.rewfilt(rew)
        data["reward_filt"].append(rew)
        for (k, v) in viewitems(envinfo):
            data[k].append(v)
        if done:
            terminated = True
            break
    data = {k: np.array(v) for (k, v) in viewitems(data)}
    data["terminated"] = terminated
    return data
def getChainsFromConnections(connections, checkConnections=True):
    '''Take a list of connections and return a list of connection chains

    connections is a dictionary of connections between elements
    (which must be hashable) and can be generated using getElementConnections

    The checkConnections option tests that there is only one path through
    each point (aka 2 or fewer connections, no branching)

    Returns a list of chains (lists of elements)
    '''
    connections = deepcopy(connections)  # Protect the input from modification
    if checkConnections:
        # Check that there is no branching
        assert all(len(v) < 3 for k, v in viewitems(connections)), \
            'Aborting; this network has branching'
    chains = []
    while len(connections):  # loop over possible chains
        # Pick a starting point (an end point if possible)
        currPt = _firstOrOther(
            [pt for pt, conn in viewitems(connections) if len(conn) == 1],
            next(iter(connections)))  # was connections.keys()[0]
        # Form a chain and move the current point forward
        chain = [currPt]
        currPt = connections.pop(currPt)[0]
        while currPt:  # loop to fill a chain, stop on an invalid point
            chain.append(currPt)
            if len(connections) == 0:
                break
            connections[currPt] = deletecases(connections[currPt],
                                              [chain[-2]])
            currPt = _firstOrOther(connections.pop(currPt, []))
        chains.append(chain)
    return chains
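# A minimal usage sketch for getChainsFromConnections. The connection dict
# below is illustrative; the helpers _firstOrOther and deletecases are
# assumed to be defined alongside the function, as in its home module.
connections = {1: [2], 2: [1, 3], 3: [2, 4], 4: [3]}
chains = getChainsFromConnections(connections)
# Expected result: a single chain covering the path, e.g. [[1, 2, 3, 4]]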
def setUp(self):
    data_d = {
        100: {"name": "Bob", "age": "50", "dataset": 0},
        105: {"name": "Charlie", "age": "75", "dataset": 1},
        110: {"name": "Meredith", "age": "40", "dataset": 1},
        115: {"name": "Sue", "age": "10", "dataset": 0},
        120: {"name": "Jimbo", "age": "21", "dataset": 0},
        125: {"name": "Jimbo", "age": "21", "dataset": 0},
        130: {"name": "Willy", "age": "35", "dataset": 0},
        135: {"name": "Willy", "age": "35", "dataset": 1},
        140: {"name": "Martha", "age": "19", "dataset": 1},
        145: {"name": "Kyle", "age": "27", "dataset": 0},
    }

    self.blocker = dedupe.blocking.Blocker(
        [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")])

    self.records_1 = dict((record_id, record)
                          for record_id, record in viewitems(data_d)
                          if record["dataset"] == 0)

    self.fields_2 = dict((record_id, record["name"])
                         for record_id, record in viewitems(data_d)
                         if record["dataset"] == 1)
def compute(asm, inputstate={}, debug=False):
    loc_db = LocationDB()
    sympool = dict(regs_init)
    sympool.update({k: ExprInt(v, k.size) for k, v in viewitems(inputstate)})
    ir_tmp = ir_arch(loc_db)
    ircfg = ir_tmp.new_ircfg()
    symexec = SymbolicExecutionEngine(ir_tmp, sympool)
    instr = mn.fromstring(asm, loc_db, "l")
    code = mn.asm(instr)[0]
    instr = mn.dis(code, "l")
    instr.offset = inputstate.get(PC, 0)
    lbl = ir_tmp.add_instr_to_ircfg(instr, ircfg)
    symexec.run_at(ircfg, lbl)
    if debug:
        for k, v in viewitems(symexec.symbols):
            if regs_init.get(k, None) != v:
                print(k, v)
    out = {}
    for k, v in viewitems(symexec.symbols):
        if k in EXCLUDE_REGS:
            continue
        elif regs_init.get(k, None) == v:
            continue
        elif isinstance(v, ExprInt):
            out[k] = int(v)
        else:
            out[k] = v
    return out
def coveredPairs(self, blocker, records_1, records_2):
    cover = {}

    pair_enumerator = core.Enumerator()

    for predicate in blocker.predicates:
        cover[predicate] = collections.defaultdict(lambda: (set(), set()))
        for id, record in viewitems(records_2):
            blocks = predicate(record, target=True)
            for block in blocks:
                cover[predicate][block][1].add(id)

        current_blocks = set(cover[predicate])
        for id, record in viewitems(records_1):
            blocks = set(predicate(record))
            for block in blocks & current_blocks:
                cover[predicate][block][0].add(id)

    for predicate, blocks in cover.items():
        pairs = {pair_enumerator[pair]
                 for A, B in blocks.values()
                 for pair in itertools.product(A, B)}
        cover[predicate] = Counter(pairs)

    return cover
def restore_snapshot(self, snapshot, memory=True):
    """Restore a @snapshot taken with .take_snapshot
    @snapshot: .take_snapshot output
    @memory: (optional) if set, also restore the memory
    """
    # Restore memory
    if memory:
        self.jitter.vm.reset_memory_page_pool()
        self.jitter.vm.reset_code_bloc_pool()
        for addr, metadata in viewitems(snapshot["mem"]):
            self.jitter.vm.add_memory_page(
                addr,
                metadata["access"],
                metadata["data"]
            )

    # Restore registers
    self.jitter.pc = snapshot["regs"][self.ir_arch.pc.name]
    for reg, value in viewitems(snapshot["regs"]):
        setattr(self.jitter.cpu, reg, value)

    # Reset intern elements
    self.jitter.vm.set_exception(0)
    self.jitter.cpu.set_exception(0)
    self.jitter.bs._atomic_mode = False

    # Reset symb exec
    for key, _ in list(viewitems(self.symb.symbols)):
        del self.symb.symbols[key]
    for expr, value in viewitems(snapshot["symb"]):
        self.symb.symbols[expr] = value
def factor_fields(tree):
    if not isinstance(tree, dict):
        return tree
    if len(tree) != 1:
        return tree
    # merge
    k1, v1 = next(iter(viewitems(tree)))
    if k1 == "mn":
        return tree
    l1, fmask1, fbits1, fname1, flen1 = k1
    if fname1 is not None:
        return tree
    if flen1 is not None:
        return tree
    if not isinstance(v1, dict):
        return tree
    if len(v1) != 1:
        return tree
    k2, v2 = next(iter(viewitems(v1)))
    if k2 == "mn":
        return tree
    l2, fmask2, fbits2, fname2, flen2 = k2
    if fname2 is not None:
        return tree
    if flen2 is not None:
        return tree
    l = l1 + l2
    fmask = (fmask1 << l2) | fmask2
    fbits = (fbits1 << l2) | fbits2
    fname = fname2
    flen = flen2
    k = l, fmask, fbits, fname, flen
    new_keys = {k: v2}
    return new_keys
def write_script_rule(self, inputs, outputs, parameters, shell_template,
                      rule_name):
    assert '_bash_' not in parameters
    first_output_name, first_output_fn = next(iter(viewitems(outputs)))
    # for rundir, since we cannot sub wildcards in shell
    if not rule_name:
        rule_name = os.path.dirname(first_output_fn)
    rule_name = self.unique_rule_name(self.legalize(rule_name))
    wildcard_rundir = os.path.normpath(
        os.path.dirname(first_output_fn))  # unsubstituted
    # We use snake_string_path b/c normpath drops leading ./,
    # but we do NOT want abspath.
    input_kvs = ', '.join('%s=%s' % (k, snake_string_path(v))
                          for k, v in sorted(viewitems(inputs)))
    output_kvs = ', '.join('%s=%s' % (k, snake_string_path(v))
                           for k, v in sorted(viewitems(outputs)))
    rule_parameters = {k: v for (k, v) in viewitems(parameters)
                       if not k.startswith('_')}
    #rule_parameters['reltopdir'] = os.path.relpath('.', wildcard_rundir) # in case we need this later
    params = ','.join('\n %s="%s"' % (k, v)
                      for (k, v) in viewitems(rule_parameters))
    shell = snake_shell(shell_template, wildcard_rundir)
    # cd $(dirname '{output.%(first_output_name)s}')
    rule = """
rule static_%(rule_name)s:
    input: %(input_kvs)s
    output: %(output_kvs)s
    params:%(params)s
    shell:
        '''
outdir=$(dirname {output[0]})
#mkdir -p ${{outdir}}
cd ${{outdir}}
date
%(shell)s
date
'''
""" % (locals())
    self.write(rule)
def comparisons(self, cover, compound_length):
    CP = predicates.CompoundPredicate

    block_index = {}
    for predicate, blocks in viewitems(cover):
        block_index[predicate] = {}
        for block_id, blocks in viewitems(blocks):
            for id in self._blocks(blocks):
                block_index[predicate].setdefault(id, set()).add(block_id)

    compounder = self.Compounder(cover, block_index)
    comparison_count = {}

    simple_predicates = sorted(cover, key=str)
    for i in range(2, compound_length + 1):
        for combo in itertools.combinations(simple_predicates, i):
            comparison_count[CP(combo)] = sum(
                self.pairs(ids)
                for ids in viewvalues(compounder(combo)))

    for pred in simple_predicates:
        comparison_count[pred] = sum(self.pairs(ids)
                                     for ids in viewvalues(cover[pred]))

    return comparison_count
def calculate_repr(self):
    result = [
        "[{0}-{1}->{2}]".format(previous, count, nnext)
        for previous, edges in viewitems(self.edges)
        for nnext, count in viewitems(edges)
    ]

    self.repr = "G({0})".format(", ".join(result))
def _filter(self, filters, data):
    for m, o in viewitems(self.field_map):
        if m in filters and o not in filters:
            filters[o] = filters.pop(m, None)
    return [i for i in data
            if all(i.get(k) == v for k, v in viewitems(filters))]
def expectedLabelPosition(peptide, labelStateInfo, sequence=None,
                          modPositions=None):
    """Returns a modification description of a certain label state of a
    peptide.

    :param peptide: Peptide sequence used to calculate the expected label
        state modifications
    :param labelStateInfo: An entry of :attr:`LabelDescriptor.labels` that
        describes a label state
    :param sequence: unmodified amino acid sequence of :var:`peptide`, if None
        it is generated by :func:`maspy.peptidemethods.removeModifications()`
    :param modPositions: dictionary describing the modification state of
        "peptide", if None it is generated by
        :func:`maspy.peptidemethods.returnModPositions()`

    :returns: {sequence position: sorted list of expected label modifications
               on that position, ...
               }
    """
    if modPositions is None:
        modPositions = maspy.peptidemethods.returnModPositions(peptide,
                                                               indexStart=0)
    if sequence is None:
        sequence = maspy.peptidemethods.removeModifications(peptide)

    currLabelMods = dict()
    for labelPosition, labelSymbols in viewitems(labelStateInfo['aminoAcidLabels']):
        labelSymbols = aux.toList(labelSymbols)
        if labelSymbols == ['']:
            pass
        elif labelPosition == 'nTerm':
            currLabelMods.setdefault(0, list())
            currLabelMods[0].extend(labelSymbols)
        else:
            for sequencePosition in aux.findAllSubstrings(sequence,
                                                          labelPosition):
                currLabelMods.setdefault(sequencePosition, list())
                currLabelMods[sequencePosition].extend(labelSymbols)

    if labelStateInfo['excludingModifications'] is not None:
        for excludingMod, excludedLabelSymbol in viewitems(labelStateInfo['excludingModifications']):
            if excludingMod not in modPositions:
                continue
            for excludingModPos in modPositions[excludingMod]:
                if excludingModPos not in currLabelMods:
                    continue
                if excludedLabelSymbol not in currLabelMods[excludingModPos]:
                    continue
                if len(currLabelMods[excludingModPos]) == 1:
                    del currLabelMods[excludingModPos]
                else:
                    excludedModIndex = currLabelMods[excludingModPos].index(excludedLabelSymbol)
                    currLabelMods[excludingModPos].pop(excludedModIndex)

    for sequencePosition in list(viewkeys(currLabelMods)):
        currLabelMods[sequencePosition] = sorted(currLabelMods[sequencePosition])
    return currLabelMods
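# An illustrative call to expectedLabelPosition. The labelStateInfo layout
# below is only inferred from the keys this function reads
# ('aminoAcidLabels' and 'excludingModifications'); a real entry would come
# from a maspy LabelDescriptor, and running this requires maspy.
labelStateInfo = {
    'aminoAcidLabels': {'nTerm': 'u:TMT6plex', 'K': 'u:TMT6plex'},
    'excludingModifications': None,
}
positions = expectedLabelPosition('PEPTIDEK', labelStateInfo)
# Expected shape: {0: ['u:TMT6plex'], 7: ['u:TMT6plex']}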
def blockTraining(pairs, predicate_set, eta=.1, epsilon=0,
                  matching="Dedupe"):
    '''
    Takes in a set of training pairs and predicates and tries to find
    a good set of blocking rules.
    '''
    blocker = blocking.Blocker(predicate_set)
    prepare_index(blocker, pairs, matching)

    if len(pairs['match']) < 50:
        compound_length = 2
    else:
        compound_length = 3

    dupe_cover = cover(blocker, pairs['match'], compound_length)
    distinct_cover = cover(blocker, pairs['distinct'], compound_length)

    distinct_count = defaultdict(int,
                                 {pred: len(pairs)
                                  for pred, pairs
                                  in viewitems(distinct_cover)})

    # Throw away the predicates that cover too many distinct pairs
    coverage_threshold = eta * len(pairs['distinct'])
    logger.info("coverage threshold: %s", coverage_threshold)

    dupe_cover = {pred: pairs
                  for pred, pairs in viewitems(dupe_cover)
                  if distinct_count[pred] < coverage_threshold}

    if not dupe_cover:
        raise ValueError(NO_PREDICATES_ERROR)

    uncoverable_dupes = set(pairs['match']) - set.union(*viewvalues(dupe_cover))

    if len(uncoverable_dupes) > epsilon:
        logger.warning(OUT_OF_PREDICATES_WARNING)
        logger.debug(uncoverable_dupes)
        epsilon = 0
    else:
        epsilon -= len(uncoverable_dupes)

    chvatal_set = greedy(dupe_cover.copy(), distinct_count, epsilon)

    dupe_cover = {pred: dupe_cover[pred] for pred in chvatal_set}

    final_predicates = tuple(dominating(dupe_cover))

    logger.info('Final predicate set:')
    for predicate in final_predicates:
        logger.info(predicate)

    return final_predicates
def defaultFetchSiAttrFromSmi(smi, si):
    """Default method to extract attributes from a spectrum metadata item
    (smi) and add them to a spectrum item (si)."""
    for key, value in viewitems(fetchSpectrumInfo(smi)):
        setattr(si, key, value)
    for key, value in viewitems(fetchScanInfo(smi)):
        setattr(si, key, value)
    if si.msLevel > 1:
        for key, value in viewitems(fetchParentIon(smi)):
            setattr(si, key, value)
def get_project_lib(regen=False):
    global projectlib
    if regen is False and projectlib:
        return projectlib
    projectlib = {}
    for project, folder in viewitems(projects.pcbs):
        projectlib[project] = PCBPrototype(project)
    for project, folder in viewitems(projects.cable_projects):
        projectlib[project] = CableProjectPrototype(project)
    return projectlib
def callback(stats):
    global COUNTER
    for (stat, val) in viewitems(stats):
        diagnostics[stat].append(val)
    if args.plot:
        animate_rollout(env, agent, min(500, args.timestep_limit))
    print("*********** Iteration %i ****************" % COUNTER)
    print(tabulate([(k, v) for k, v in viewitems(stats)
                    if np.asarray(v).size == 1]))  # pylint: disable=W0110
    COUNTER += 1
    if args.snapshot_every and ((COUNTER % args.snapshot_every == 0) or
                                (COUNTER == args.n_iter)):
        hdf['/agent_snapshots/%0.4i' % COUNTER] = np.array(
            cPickle.dumps(agent, -1))
def consistency_check(self):
    """Ensure internal structures are consistent with each other"""
    assert set(self._loc_key_to_names).issubset(self._loc_keys)
    assert set(self._loc_key_to_offset).issubset(self._loc_keys)
    assert self._loc_key_to_offset == {
        v: k for k, v in viewitems(self._offset_to_loc_key)
    }
    assert reduce(
        lambda x, y: x.union(y),
        viewvalues(self._loc_key_to_names),
        set(),
    ) == set(self._name_to_loc_key)
    for name, loc_key in viewitems(self._name_to_loc_key):
        assert name in self._loc_key_to_names[loc_key]
def _build_biom_tables(self, samples, rarefaction_depth):
    """Build tables and add them to the analysis"""
    with qdb.sql_connection.TRN:
        # filter and combine all study BIOM tables needed for
        # each data type
        new_tables = {dt: None for dt in self.data_types}
        base_fp = qdb.util.get_work_base_dir()

        for a_id, samps in viewitems(samples):
            # one biom table attached to each artifact object
            artifact = qdb.artifact.Artifact(a_id)
            table_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    table_fp = fp
                    break
            if not table_fp:
                raise RuntimeError(
                    "Artifact %s do not have a biom table associated"
                    % a_id)
            table = load_table(table_fp)
            # HACKY WORKAROUND FOR DEMO. Issue # 246
            # make sure samples not in biom table are not filtered for
            table_samps = set(table.ids())
            filter_samps = table_samps.intersection(samps)
            # add the metadata column for study the samples come from
            study_meta = {'Study': artifact.study.title,
                          'Processed_id': artifact.id}
            samples_meta = {sid: study_meta for sid in filter_samps}
            # filter for just the wanted samples and merge into new table
            # this if/else setup avoids needing a blank table to
            # start merges
            table.filter(filter_samps, axis='sample', inplace=True)
            table.add_metadata(samples_meta, axis='sample')
            data_type = artifact.data_type
            if new_tables[data_type] is None:
                new_tables[data_type] = table
            else:
                new_tables[data_type] = new_tables[data_type].merge(table)

        # add the new tables to the analysis
        _, base_fp = qdb.util.get_mountpoint(self._table)[0]
        for dt, biom_table in viewitems(new_tables):
            # rarefy, if specified
            if rarefaction_depth is not None:
                biom_table = biom_table.subsample(rarefaction_depth)
            # write out the file
            biom_fp = join(base_fp,
                           "%d_analysis_%s.biom" % (self._id, dt))
            with biom_open(biom_fp, 'w') as f:
                biom_table.to_hdf5(
                    f, "Analysis %s Datatype %s" % (self._id, dt))
            self._add_file("%d_analysis_%s.biom" % (self._id, dt),
                           "biom", data_type=dt)
def symb_exec(lbl, ir_arch, ircfg, inputstate, debug):
    sympool = dict(regs_init)
    sympool.update(inputstate)
    symexec = SymbolicExecutionEngine(ir_arch, sympool)
    symexec.run_at(ircfg, lbl)
    if debug:
        for k, v in viewitems(symexec.symbols):
            if regs_init.get(k, None) != v:
                print(k, v)
    return {
        k: v for k, v in viewitems(symexec.symbols)
        if k not in EXCLUDE_REGS and regs_init.get(k, None) != v
    }
def __init__(self, *args, **kwargs):
    """Reads kwargs as properties of self."""
    # perform init on temp dict to preserve interface: will then translate
    # aliased keys when loading into self
    temp = {}
    unalias = self.unalias

    dict.__init__(temp, *args, **kwargs)
    for key, val in viewitems(temp):
        self[unalias(key)] = val

    for name, prototype in viewitems(self.Required):
        new_name = unalias(name)
        if new_name not in self:
            self[new_name] = self._copy(prototype)
def store_survey(survey, survey_id):
    """Store the survey

    Parameters
    ----------
    survey : amgut.lib.data_access.survey.Survey
        The corresponding survey
    survey_id : str
        The corresponding survey ID to retrieve from redis
    """
    def get_survey_question_id(key):
        return int(key.split('_')[-2])

    data = redis.hgetall(survey_id)
    to_store = PartitionResponse(survey.question_types)
    consent_details = loads(data.pop('consent'))
    if 'existing' in data:
        data.pop('existing')

    for page in data:
        page_data = loads(data[page])
        questions = page_data['questions']

        for quest, resps in viewitems(questions):
            qid = get_survey_question_id(quest)
            qtype = survey.question_types[qid]

            if resps is None:
                resps = {-1}  # unspecified multiple choice
            elif qtype in ['SINGLE', 'MULTIPLE']:
                resps = set([int(i) for i in resps])
            else:
                pass

            to_store[qid] = resps

    with_fk_inserts = []
    for qid, indices in viewitems(to_store.with_fk):
        question = survey.questions[qid]
        for idx in indices:
            resp = question.responses[idx] if idx != -1 else survey.unspecified
            with_fk_inserts.append((survey_id, qid, resp))

    without_fk_inserts = [(survey_id, qid, dumps(v))
                          for qid, v in viewitems(to_store.without_fk)]

    survey.store_survey(consent_details, with_fk_inserts, without_fk_inserts)
def _update_accumulators(cls, elt, accumulators, on_create_set, on_match_set):
    on_create_set.extend(["%s.%s = [%s]" % (elt, field, srcfield)
                          for field, (srcfield, _) in viewitems(accumulators)])
    on_match_set.extend([
        ("%(elt)s.%(field)s = CASE WHEN " +
         ("" if maxvalue is None else
          "SIZE(%(elt)s.%(field)s) > %(maxvalue)d OR ") +
         "%(srcfield)s IN %(elt)s.%(field)s THEN %(elt)s.%(field)s ELSE " +
         "COALESCE(%(elt)s.%(field)s, []) + %(srcfield)s END") % {
             "elt": elt, "field": field, "srcfield": srcfield,
             "maxvalue": maxvalue
         }
        for field, (srcfield, maxvalue) in viewitems(accumulators)
    ])
def _show_dependencies(self):
    """Show dependencies"""
    created = self.created
    departing_arrows = self.departing_arrows

    self._fix_dependencies()

    for source, targets in viewitems(departing_arrows):
        if source not in created:
            continue
        for target, style in viewitems(targets):
            if target not in created or source == target:
                continue
            dep = (variable_id(source), variable_id(target))
            self.dependencies[dep] = style
def iteritems(self):
    for dst, src in viewitems(self._assigns):
        yield dst, src
def items(self):
    return [(dst, src) for dst, src in viewitems(self._assigns)]
def __eq__(self, other):
    if set(self.keys()) != set(other.keys()):
        return False
    return all(other[dst] == src for dst, src in viewitems(self))
def __str__(self):
    out = []
    for dst, src in sorted(viewitems(self._assigns)):
        out.append("%s = %s" % (dst, src))
    return "\n".join(out)
def _build_biom_tables(self, samples, rarefaction_depth=None,
                       rename_dup_samples=False):
    """Build tables and add them to the analysis"""
    with qdb.sql_connection.TRN:
        base_fp = qdb.util.get_work_base_dir()

        # this assumes that there is only one reference/pipeline for each
        # data_type issue #164
        new_tables = {dt: None for dt in self.data_types}
        for aid, samps in viewitems(samples):
            artifact = qdb.artifact.Artifact(aid)

            # this is not checking the reference used for picking
            # issue #164
            biom_table_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_table_fp = fp
                    break
            if not biom_table_fp:
                raise RuntimeError(
                    "Artifact %s do not have a biom table associated" % aid)

            biom_table = load_table(biom_table_fp)
            # filtering samples to keep those selected by the user
            biom_table_samples = set(biom_table.ids())
            selected_samples = biom_table_samples.intersection(samps)
            biom_table.filter(selected_samples, axis='sample', inplace=True)

            if rename_dup_samples:
                ids_map = {_id: "%d.%s" % (aid, _id)
                           for _id in biom_table.ids()}
                biom_table.update_ids(ids_map, 'sample', True, True)

            # add the metadata column for study the samples come from,
            # this is useful in case the user download the bioms
            study_md = {'Study': artifact.study.title, 'Artifact_id': aid}
            samples_md = {sid: study_md for sid in selected_samples}
            biom_table.add_metadata(samples_md, axis='sample')

            data_type = artifact.data_type
            # this is not checking the reference used for picking
            # issue #164
            if new_tables[data_type] is None:
                new_tables[data_type] = biom_table
            else:
                new_tables[data_type] = \
                    new_tables[data_type].merge(biom_table)

        # add the new tables to the analysis
        _, base_fp = qdb.util.get_mountpoint(self._table)[0]
        for dt, biom_table in viewitems(new_tables):
            if biom_table is None:
                continue
            # rarefy, if specified
            if rarefaction_depth is not None:
                biom_table = biom_table.subsample(rarefaction_depth)
            # write out the file
            biom_fp = join(base_fp, "%d_analysis_%s.biom" % (self._id, dt))
            with biom_open(biom_fp, 'w') as f:
                biom_table.to_hdf5(
                    f, "Analysis %s Datatype %s" % (self._id, dt))
            self._add_file("%d_analysis_%s.biom" % (self._id, dt),
                           "biom", data_type=dt)
def __init__(self, symbols):
    tmp = {}
    for expr, types in viewitems(symbols):
        tmp[expr] = frozenset(types)
    self._symbols = frozenset(viewitems(tmp))
def artifact_post_req(user_id, filepaths, artifact_type, name,
                      prep_template_id, artifact_id=None):
    """Creates the initial artifact for the prep template

    Parameters
    ----------
    user_id : str
        User adding the artifact
    filepaths : dict of str
        Comma-separated list of files to attach to the artifact,
        keyed by file type
    artifact_type : str
        The type of the artifact
    name : str
        Name to give the artifact
    prep_template_id : int or str castable to int
        Prep template to attach the artifact to
    artifact_id : int or str castable to int, optional
        The id of the imported artifact

    Returns
    -------
    dict of objects
        A dictionary containing the new artifact ID
        {'status': status,
         'message': message,
         'artifact': id}
    """
    prep_template_id = int(prep_template_id)
    prep = PrepTemplate(prep_template_id)
    study_id = prep.study_id

    # First check if the user has access to the study
    access_error = check_access(study_id, user_id)
    if access_error:
        return access_error

    user = User(user_id)

    if artifact_id:
        # if the artifact id has been provided, import the artifact
        qiita_plugin = Software.from_name_and_version('Qiita', 'alpha')
        cmd = qiita_plugin.get_command('copy_artifact')
        params = Parameters.load(cmd, values_dict={
            'artifact': artifact_id,
            'prep_template': prep.id})
        job = ProcessingJob.create(user, params)
    else:
        uploads_path = get_mountpoint('uploads')[0][1]
        path_builder = partial(join, uploads_path, str(study_id))
        cleaned_filepaths = {}

        for ftype, file_list in viewitems(filepaths):
            # JavaScript sends us this list as a comma-separated list
            for fp in file_list.split(','):
                # JavaScript will send this value as an empty string if the
                # list of files was empty. In such case, the split will
                # generate a single element containing the empty string.
                # Check for that case here and, if fp is not the empty
                # string, proceed to check if the file exists
                if fp:
                    # Check if filepath being passed exists for study
                    full_fp = path_builder(fp)
                    exists = check_fp(study_id, full_fp)
                    if exists['status'] != 'success':
                        return {'status': 'error',
                                'message': 'File does not exist: %s' % fp}
                    if ftype not in cleaned_filepaths:
                        cleaned_filepaths[ftype] = []
                    cleaned_filepaths[ftype].append(full_fp)

        # This should never happen, but it doesn't hurt to actually have
        # an explicit check, in case there is something odd with the JS
        if not cleaned_filepaths:
            return {'status': 'error',
                    'message': "Can't create artifact, no files provided."}

        command = Command.get_validator(artifact_type)
        job = ProcessingJob.create(
            user,
            Parameters.load(command, values_dict={
                'template': prep_template_id,
                'files': dumps(cleaned_filepaths),
                'artifact_type': artifact_type,
                'name': name}))

    # Submit the job
    job.submit()

    r_client.set(PREP_TEMPLATE_KEY_FORMAT % prep.id,
                 dumps({'job_id': job.id, 'is_qiita_job': True}))

    return {'status': 'success', 'message': ''}
def raster_calc(output, equation=None, out_type='byte', extent=None, overwrite=False, be_quiet=False, out_no_data=0, row_block_size=2000, col_block_size=2000, apply_all_bands=False, **kwargs): """ Raster calculator Args: output (str): The output image. equation (Optional[str]): The equation to calculate. out_type (Optional[str]): The output raster storage type. Default is 'byte'. extent (Optional[str]): An image or instance of ``mappy.ropen`` to use for the output extent. Default is None. overwrite (Optional[bool]): Whether to overwrite an existing IDW image. Default is False. be_quiet (Optional[bool]): Whether to be quiet and do not report progress. Default is False. out_no_data (Optional[int]): The output no data value. Default is 0. row_block_size (Optional[int]): The row block chunk size. Default is 2000. col_block_size (Optional[int]): The column block chunk size. Default is 2000. apply_all_bands (Optional[bool]): Whether to apply the equation to all bands. Default is False. **kwargs (str): The rasters to compute. E.g., A='/some_raster1.tif', F='/some_raster2.tif'. Band positions default to 1 unless given as [A]_band. Examples: >>> from mpglue.raster_calc import raster_calc >>> >>> # Multiply image A x image B >>> raster_calc('/output.tif', >>> equation='A * B', >>> A='/some_raster1.tif', >>> B='some_raster2.tif') >>> >>> # Reads as... >>> # Where image A equals 1 AND image B is greater than 5, >>> # THEN write image A, OTHERWISE write 0 >>> raster_calc('/output.tif', >>> equation='where((A == 1) & (B > 5), A, 0)', >>> A='/some_raster1.tif', >>> B='some_raster2.tif') >>> >>> # Use different bands from the same image. The letter given for the >>> # image must be the same for the band, followed by _band. >>> # E.g., for raster 'n', the corresponding band would be 'n_band'. For >>> # raster 'r', the corresponding band would be 'r_band', etc. >>> raster_calc('/output.tif', >>> equation='(n - r) / (n + r)', >>> n='/some_raster.tif', >>> n_band=4, >>> r='/some_raster.tif', >>> r_band=3) Returns: None, writes to ``output``. """ # Set the image dictionary image_dict = dict() info_dict = dict() info_list = list() band_dict = dict() temp_files = list() if isinstance(extent, str): ot_info = raster_tools.ropen(extent) temp_dict = copy(kwargs) for kw, vw in viewitems(kwargs): if isinstance(vw, str): d_name, f_name = os.path.split(vw) f_base, __ = os.path.splitext(f_name) vw_sub = os.path.join(d_name, '{}_temp.vrt'.format(f_base)) raster_tools.translate(vw, vw_sub, format='VRT', projWin=[ ot_info.left, ot_info.top, ot_info.right, ot_info.bottom ]) temp_files.append(vw_sub) temp_dict[kw] = vw_sub kwargs = temp_dict for kw, vw in viewitems(kwargs): if '_band' not in kw: band_dict['{}_band'.format(kw)] = 1 if isinstance(vw, str): image_dict[kw] = vw exec('i_info_{} = raster_tools.ropen(r"{}")'.format(kw, vw)) exec('info_dict["{}"] = i_info_{}'.format(kw, kw)) exec('info_list.append(i_info_{})'.format(kw)) if isinstance(vw, int): band_dict[kw] = vw for key, value in viewitems(image_dict): equation = equation.replace(key, 'marrvar_{}'.format(key)) # Check for NumPy functions. # for np_func in dir(np): # # if 'np.' + np_func in equation: # # equation = 'np.{}'.format(equation) # break for kw, vw in viewitems(info_dict): o_info = copy(vw) break n_bands = 1 if not apply_all_bands else o_info.bands if isinstance(extent, raster_tools.ropen): # Set the extent from an object. overlap_info = extent elif isinstance(extent, str): # Set the extent from an existing image. 
overlap_info = raster_tools.ropen(extent) else: # Check overlapping extent overlap_info = info_list[0].copy() for i_ in range(1, len(info_list)): # Get the minimum overlapping extent # from all input images. overlap_info = raster_tools.GetMinExtent(overlap_info, info_list[i_]) o_info.update_info(left=overlap_info.left, right=overlap_info.right, top=overlap_info.top, bottom=overlap_info.bottom, rows=overlap_info.rows, cols=overlap_info.cols, storage=out_type, bands=n_bands) if overwrite: overwrite_file(output) out_rst = raster_tools.create_raster(output, o_info) if n_bands == 1: out_rst.get_band(1) block_rows, block_cols = raster_tools.block_dimensions( o_info.rows, o_info.cols, row_block_size=row_block_size, col_block_size=col_block_size) if not be_quiet: ctr, pbar = _iteration_parameters(o_info.rows, o_info.cols, block_rows, block_cols) # Iterate over the minimum overlapping extent. for i in range(0, o_info.rows, block_rows): n_rows = raster_tools.n_rows_cols(i, block_rows, o_info.rows) for j in range(0, o_info.cols, block_cols): n_cols = raster_tools.n_rows_cols(j, block_cols, o_info.cols) # For each image, get the offset and # convert bands in the equation to ndarrays. for key, value in viewitems(image_dict): # exec 'x_off, y_off = vector_tools.get_xy_offsets3(overlap_info, i_info_{})'.format(key) x_off, y_off = vector_tools.get_xy_offsets( image_info=info_dict[key], x=overlap_info.left, y=overlap_info.top, check_position=False)[2:] exec( 'marrvar_{KEY} = info_dict["{KEY}"].read(bands2open=band_dict["{KEY}_band"], i=i+y_off, j=j+x_off, rows=n_rows, cols=n_cols, d_type="float32")' .format(KEY=key)) if '&&' in equation: out_array = np.empty((n_bands, n_rows, n_cols), dtype='float32') for eqidx, equation_ in enumerate(equation.split('&&')): if 'nan_to_num' in equation_: if not equation_.startswith('np.'): equation_ = 'np.' + equation_ equation_ = 'out_array[eqidx] = {}'.format(equation_) exec(equation_) else: out_array[eqidx] = ne.evaluate(equation_) else: if 'nan_to_num' in equation: equation_ = 'out_array = {}'.format(equation) exec(equation_) else: out_array = ne.evaluate(equation) # Set the output no data values. out_array[np.isnan(out_array) | np.isinf(out_array)] = out_no_data if n_bands == 1: out_rst.write_array(out_array, i=i, j=j) else: for lidx in range(0, n_bands): out_rst.write_array(out_array[lidx], i=i, j=j, band=lidx + 1) if not be_quiet: pbar.update(ctr) ctr += 1 if not be_quiet: pbar.finish() # Close the input image. for key, value in viewitems(info_dict): info_dict[key].close() # close the output drivers out_rst.close_all() out_rst = None # Cleanup for temp_file in temp_files: if os.path.isfile(temp_file): os.remove(temp_file)
def search_engine(self, item_name, criteria):
    """ Call GLPI's search engine syntax.

    INPUT query in JSON format (/apirest.php#search-items):
    metacriteria: [
        {
            "link": 'AND',
            "searchtype": "contains",
            "field": "name",
            "value": "search value"
        }
    ]

    RETURNS: GLPI's APIRest JSON formatted with the result of the search
    in key 'data'.
    """
    # Receive the possible field ids for type item_name
    # -> to avoid wrong lookups, use uid of fields, but strip item type:
    #    example: {"1": {"uid": "Computer.name"}} gets {"name": 1}
    field_map = {}
    opts = self.search_options(item_name)
    for field_id, field_opts in viewitems(opts):
        if field_id.isdigit() and 'uid' in field_opts:
            # support case-insensitive strip from item_name!
            field_name = re.sub('^' + item_name + '.', '',
                                field_opts['uid'], flags=re.IGNORECASE)
            field_map[field_name] = int(field_id)

    uri_query = "%s?" % item_name

    for idx, c in enumerate(criteria['criteria']):
        # build field argument
        if idx == 0:
            uri = ""
        else:
            uri = "&"
        if 'field' in c and c['field'] is not None:
            field_name = ""
            # if int given, use it directly
            if isinstance(c['field'], int) or c['field'].isdigit():
                field_name = int(c['field'])
            # if name given, try to map to an int
            elif c['field'] in field_map:
                field_name = field_map[c['field']]
            else:
                raise GlpiInvalidArgument(
                    'Cannot map field name "' + c['field'] + '" to ' +
                    'a field id for ' + str(idx + 1) + '. criterion ' +
                    str(c))
            uri = uri + "criteria[%d][field]=%d" % (idx, field_name)
        else:
            raise GlpiInvalidArgument('Missing "field" parameter for ' +
                                      str(idx + 1) + 'the criteria: ' +
                                      str(c))

        # build value argument
        if 'value' not in c or c['value'] is None:
            uri = uri + "&criteria[%d][value]=" % (idx)
        else:
            uri = uri + "&criteria[%d][value]=%s" % (idx, c['value'])

        # build searchtype argument
        # -> optional! defaults to "contains" on the server if empty
        if 'searchtype' in c and c['searchtype'] is not None:
            uri = uri + "&criteria[%d][searchtype]=%s" % (idx,
                                                          c['searchtype'])
        else:
            uri = uri + "&criteria[%d][searchtype]=" % (idx)

        # link is optional for 1st criterion according to docs...
        # -> error if not present but more than one criterion
        if 'link' not in c and idx > 0:
            raise GlpiInvalidArgument('Missing link type for ' +
                                      str(idx + 1) + '. criterion ' + str(c))
        elif 'link' in c:
            uri = uri + "&criteria[%d][link]=%s" % (idx, c['link'])

        # add this criterion to the query
        uri_query = uri_query + uri

    try:
        if not self.api_has_session():
            self.init_api()
        self.update_uri('search')
        # TODO: is this call correct? shouldn't this be search_engine()?
        return self.api_rest.search_options(uri_query)
    except GlpiException as e:
        return {'{}'.format(e)}
def displayhost(record, showscripts=True, showtraceroute=True, showos=True, out=sys.stdout): """Displays (on `out`, by default `sys.stdout`) the Nmap scan result contained in `record`. """ line = "Host %s" % utils.force_int2ip(record['addr']) if record.get('hostnames'): line += " (%s)" % '/'.join(x['name'] for x in record['hostnames']) if 'source' in record: line += ' from %s' % record['source'] if record.get('categories'): line += ' (%s)' % ', '.join(record['categories']) if 'state' in record: line += ' (%s' % record['state'] if 'state_reason' in record: line += ': %s' % record['state_reason'] line += ')\n' out.write(line) if 'infos' in record: infos = record['infos'] if 'country_code' in infos or 'country_name' in infos: out.write("\t%s - %s" % (infos.get( 'country_code', '?'), infos.get('country_name', '?'))) if 'city' in infos: out.write(' - %s' % infos['city']) out.write('\n') if 'as_num' in infos or 'as_name' in infos: out.write("\tAS%s - %s\n" % (infos.get('as_num', '?'), infos.get('as_name', '?'))) if 'starttime' in record and 'endtime' in record: out.write("\tscan %s - %s\n" % (record['starttime'], record['endtime'])) for state, counts in viewitems(record.get('extraports', {})): out.write("\t%d ports %s (%s)\n" % (counts["total"], state, ", ".join( "%d %s" % (count, reason) for reason, count in viewitems(counts["reasons"]) if reason != "total"))) ports = record.get('ports', []) ports.sort( key=lambda x: (utils.key_sort_none(x.get('protocol')), x['port'])) for port in ports: if port.get('port') == -1: record['scripts'] = port['scripts'] continue if 'state_reason' in port: reason = " (%s)" % ', '.join([port['state_reason']] + [ "%s=%s" % (field[13:], value) for field, value in viewitems(port) if field.startswith('state_reason_') ]) else: reason = "" if 'service_name' in port: srv = port['service_name'] if 'service_method' in port: srv += ' (%s)' % port['service_method'] for field in [ 'service_product', 'service_version', 'service_extrainfo', 'service_ostype', 'service_hostname' ]: if field in port: srv += ' %s' % port[field] else: srv = "" out.write("\t%-10s%-8s%-22s%s\n" % ('%s/%d' % (port.get('protocol'), port['port']), port['state_state'], reason, srv)) if showscripts: out.writelines(_scriptoutput(port)) if showscripts: scripts = _scriptoutput(record) if scripts: out.write('\tHost scripts:\n') out.writelines(scripts) if showtraceroute and record.get('traces'): for trace in record['traces']: proto = trace['protocol'] if proto in ['tcp', 'udp']: proto += '/%d' % trace['port'] out.write('\tTraceroute (using %s)\n' % proto) hops = trace['hops'] hops.sort(key=lambda hop: hop['ttl']) for hop in hops: out.write('\t\t%3s %15s %7s\n' % ( hop['ttl'], utils.force_int2ip(hop['ipaddr']), hop['rtt'], )) if showos and record.get('os', {}).get('osclass'): osclasses = record['os']['osclass'] maxacc = str(max(int(x['accuracy']) for x in osclasses)) osclasses = [ osclass for osclass in osclasses if osclass['accuracy'] == maxacc ] out.write('\tOS fingerprint\n') for osclass in osclasses: out.write('\t\t%(osfamily)s / %(type)s / %(vendor)s / ' 'accuracy = %(accuracy)s\n' % osclass)
def build_graph(start_addr, type_graph, simplify=False, use_ida_stack=True, dontmodstack=False, loadint=False, verbose=False): machine = guess_machine(addr=start_addr) dis_engine, ira = machine.dis_engine, machine.ira class IRADelModCallStack(ira): def call_effects(self, addr, instr): assignblks, extra = super(IRADelModCallStack, self).call_effects(addr, instr) if use_ida_stack: stk_before = idc.get_spd(instr.offset) stk_after = idc.get_spd(instr.offset + instr.l) stk_diff = stk_after - stk_before print(hex(stk_diff)) call_assignblk = AssignBlock([ ExprAssign(self.ret_reg, ExprOp('call_func_ret', addr)), ExprAssign(self.sp, self.sp + ExprInt(stk_diff, self.sp.size)) ], instr) return [call_assignblk], [] else: if not dontmodstack: return assignblks, extra out = [] for assignblk in assignblks: dct = dict(assignblk) dct = { dst: src for (dst, src) in viewitems(dct) if dst != self.sp } out.append(AssignBlock(dct, assignblk.instr)) return out, extra if verbose: print("Arch", dis_engine) fname = idc.get_root_filename() if verbose: print(fname) bs = bin_stream_ida() loc_db = LocationDB() mdis = dis_engine(bs, loc_db=loc_db) ir_arch = IRADelModCallStack(loc_db) # populate symbols with ida names for addr, name in idautils.Names(): if name is None: continue if (loc_db.get_offset_location(addr) or loc_db.get_name_location(name)): # Symbol alias continue loc_db.add_location(name, addr) if verbose: print("start disasm") if verbose: print(hex(start_addr)) asmcfg = mdis.dis_multiblock(start_addr) entry_points = set([loc_db.get_offset_location(start_addr)]) if verbose: print("generating graph") open('asm_flow.dot', 'w').write(asmcfg.dot()) print("generating IR... %x" % start_addr) ircfg = ir_arch.new_ircfg_from_asmcfg(asmcfg) if verbose: print("IR ok... %x" % start_addr) for irb in list(viewvalues(ircfg.blocks)): irs = [] for assignblk in irb: new_assignblk = { expr_simp(dst): expr_simp(src) for dst, src in viewitems(assignblk) } irs.append(AssignBlock(new_assignblk, instr=assignblk.instr)) ircfg.blocks[irb.loc_key] = IRBlock(loc_db, irb.loc_key, irs) if verbose: out = ircfg.dot() open(os.path.join(tempfile.gettempdir(), 'graph.dot'), 'wb').write(out) title = "Miasm IR graph" head = list(entry_points)[0] if simplify: ircfg_simplifier = IRCFGSimplifierCommon(ir_arch) ircfg_simplifier.simplify(ircfg, head) title += " (simplified)" if type_graph == TYPE_GRAPH_IR: graph = GraphMiasmIR(ircfg, title, None) graph.Show() return class IRAOutRegs(ira): def get_out_regs(self, block): regs_todo = super(IRAOutRegs, self).get_out_regs(block) out = {} for assignblk in block: for dst in assignblk: reg = self.ssa_var.get(dst, None) if reg is None: continue if reg in regs_todo: out[reg] = dst return set(viewvalues(out)) # Add dummy dependency to uncover out regs affectation for loc in ircfg.leaves(): irblock = ircfg.blocks.get(loc) if irblock is None: continue regs = {} for reg in ir_arch.get_out_regs(irblock): regs[reg] = reg assignblks = list(irblock) new_assiblk = AssignBlock(regs, assignblks[-1].instr) assignblks.append(new_assiblk) new_irblock = IRBlock(irblock.loc_db, irblock.loc_key, assignblks) ircfg.blocks[loc] = new_irblock class CustomIRCFGSimplifierSSA(IRCFGSimplifierSSA): def do_simplify(self, ssa, head): modified = super(CustomIRCFGSimplifierSSA, self).do_simplify(ssa, head) if loadint: modified |= load_from_int(ssa.graph, bs, is_addr_ro_variable) return modified def simplify(self, ircfg, head): ssa = self.ircfg_to_ssa(ircfg, head) ssa = self.do_simplify_loop(ssa, head) if type_graph == TYPE_GRAPH_IRSSA: ret 
= ssa.graph elif type_graph == TYPE_GRAPH_IRSSAUNSSA: ircfg = self.ssa_to_unssa(ssa, head) ircfg_simplifier = IRCFGSimplifierCommon(self.ir_arch) ircfg_simplifier.simplify(ircfg, head) ret = ircfg else: raise ValueError("Unknown option") return ret head = list(entry_points)[0] simplifier = CustomIRCFGSimplifierSSA(ir_arch) ircfg = simplifier.simplify(ircfg, head) open('final.dot', 'w').write(ircfg.dot()) graph = GraphMiasmIR(ircfg, title, None) graph.Show()
def _build_mapping_file(self, samples, rename_dup_samples=False):
    """Builds the combined mapping file for all samples
       Code modified slightly from qiime.util.MetadataMap.__add__"""
    with qdb.sql_connection.TRN:
        # query to get the latest qiime mapping file
        sql = """SELECT filepath
                 FROM qiita.filepath
                    JOIN qiita.prep_template_filepath USING (filepath_id)
                    JOIN qiita.prep_template USING (prep_template_id)
                    JOIN qiita.filepath_type USING (filepath_type_id)
                 WHERE filepath_type = 'qiime_map'
                    AND artifact_id IN (
                        SELECT * FROM qiita.find_artifact_roots(%s))
                 ORDER BY filepath_id DESC LIMIT 1"""
        _, fp = qdb.util.get_mountpoint('templates')[0]

        all_ids = set()
        to_concat = []

        for aid, samps in viewitems(samples):
            qdb.sql_connection.TRN.add(sql, [aid])
            qm_fp = qdb.sql_connection.TRN.execute_fetchindex()[0][0]

            # Parse the mapping file
            qm = qdb.metadata_template.util.load_template_to_dataframe(
                join(fp, qm_fp), index='#SampleID')

            # if we are not going to merge the duplicated samples
            # append the aid to the sample name
            if rename_dup_samples:
                qm['original_SampleID'] = qm.index
                qm['#SampleID'] = "%d." % aid + qm.index
                qm['qiita_aid'] = aid
                samps = ['%d.%s' % (aid, _id) for _id in samps]
                qm.set_index('#SampleID', inplace=True, drop=True)
            else:
                samps = set(samps) - all_ids
                all_ids.update(samps)

            qm = qm.loc[samps]
            to_concat.append(qm)

        merged_map = pd.concat(to_concat)

        # forcing QIIME column order
        cols = merged_map.columns.values.tolist()
        cols.remove('BarcodeSequence')
        cols.remove('LinkerPrimerSequence')
        cols.remove('Description')
        cols = (['BarcodeSequence', 'LinkerPrimerSequence'] + cols +
                ['Description'])
        merged_map = merged_map[cols]

        # Save the mapping file
        _, base_fp = qdb.util.get_mountpoint(self._table)[0]
        mapping_fp = join(base_fp, "%d_analysis_mapping.txt" % self._id)
        merged_map.to_csv(mapping_fp, index_label='#SampleID',
                          na_rep='unknown', sep='\t')

        self._add_file("%d_analysis_mapping.txt" % self._id, "plain_text")
def populate(self, mtime):
    self._mtime = mtime
    self.collection.subscribe(self.on_event)
    for entry, item in viewitems(self.collection):
        self.new_entry(entry, item, self.mtime())
def items(self):
    return viewitems(self._entries)
def main(): try: import argparse parser = argparse.ArgumentParser(description=DESCRIPTION) except ImportError: import optparse parser = optparse.OptionParser(description=DESCRIPTION) parser.parse_args_orig = parser.parse_args parser.parse_args = lambda: parser.parse_args_orig()[0] parser.add_argument = parser.add_option parser.add_argument('--init', '--purgedb', action='store_true', help='Purge or create and initialize the database.') parser.add_argument( '--ensure-indexes', action='store_true', help='Create missing indexes (will lock the database).') parser.add_argument('--node-filters', '-n', nargs="+", help='Filter the results with a list of ivre specific ' 'node textual filters (see WebUI doc in FLOW.md).') parser.add_argument('--flow-filters', '-f', nargs="+", help='Filter the results with a list of ivre specific ' 'flow textual filters (see WebUI doc in FLOW.md).') parser.add_argument('--json', '-j', action='store_true', help='Outputs the full json records of results.') parser.add_argument('--count', '-c', action='store_true', help='Only return the count of the results.') parser.add_argument('--limit', '-l', type=int, help='Ouput at most LIMIT results.') parser.add_argument('--skip', type=int, default=0, help='Skip first SKIP results.') parser.add_argument('--orderby', '-o', help='Order of results ("src", "dst" or "flow")') parser.add_argument('--separator', '-s', help="Separator string.") parser.add_argument('--top', '-t', nargs="+", help='Top flows for a given set of fields, e.g. ' '"--top src.addr dport".') parser.add_argument( '--collect', '-C', nargs="+", help='When using --top, also collect these properties.') parser.add_argument('--sum', '-S', nargs="+", help='When using --top, sum on these properties to ' 'order the result.') parser.add_argument('--mode', '-m', help="Query special mode (flow_map, talk_map...)") parser.add_argument('--timeline', '-T', action="store_true", help='Retrieves the timeline of each flow') parser.add_argument('--flow-daily', action="store_true", help="Flow count per times of the day") parser.add_argument('--plot', action="store_true", help="Plot data when possible (requires matplotlib).") parser.add_argument('--fields', nargs='+', help="Display these fields for each entry.") args = parser.parse_args() out = sys.stdout if args.plot and plt is None: utils.LOGGER.critical("Matplotlib is required for --plot") sys.exit(-1) if args.init: if os.isatty(sys.stdin.fileno()): out.write('This will remove any scan result in your database. ' 'Process ? [y/N] ') ans = input() if ans.lower() != 'y': sys.exit(-1) db.flow.init() sys.exit(0) if args.ensure_indexes: if os.isatty(sys.stdin.fileno()): out.write('This will lock your database. ' 'Process ? 
[y/N] ') ans = input() if ans.lower() != 'y': sys.exit(-1) db.flow.ensure_indexes() sys.exit(0) filters = { "nodes": args.node_filters or [], "edges": args.flow_filters or [] } query = db.flow.from_filters(filters, limit=args.limit, skip=args.skip, orderby=args.orderby, mode=args.mode, timeline=args.timeline) sep = args.separator or ' | ' coma = ';' if args.separator else '; ' coma2 = ',' if args.separator else ', ' if args.count: count = db.flow.count(query) out.write('%(clients)d clients\n%(servers)d servers\n' '%(flows)d flows\n' % count) elif args.top: top = db.flow.top(query, args.top, args.collect, args.sum) for rec in top: sys.stdout.write( "%s%s%s%s%s\n" % (coma.join(str(elt) for elt in rec["fields"]), sep, rec["count"], sep, coma.join( str(coma2.join(str(val) for val in elt)) for elt in rec["collected"]) if rec["collected"] else "")) elif args.flow_daily: # FIXME? fully in-memory if args.plot: plot_data = {} for rec in db.flow.flow_daily(query): out.write( sep.join([ rec["flow"], rec["time_in_day"].strftime("%T.%f"), str(rec["count"]) ])) out.write("\n") if args.plot: plot_data.setdefault(rec["flow"], [[], []]) plot_data[rec["flow"]][0].append(rec["time_in_day"]) plot_data[rec["flow"]][1].append(rec["count"]) for flow, points in viewitems(plot_data): plt.plot(points[0], points[1], label=flow) plt.legend(loc='best') plt.show() else: fmt = '%%s%s%%s%s%%s' % (sep, sep) node_width = len('XXX.XXX.XXX.XXX') flow_width = len('tcp/XXXXX') for res in db.flow.to_iter(query): if args.json: out.write('%s\n' % res) else: elts = {} for elt in ["src", "flow", "dst"]: elts[elt] = res[elt]['label'] if args.fields: elts[elt] = "%s%s%s" % ( elts[elt], coma, coma.join( str(res[elt]['data'].get(field, "")) for field in args.fields)) src, flow, dst = elts["src"], elts["flow"], elts["dst"] node_width = max(node_width, len(src), len(dst)) flow_width = max(flow_width, len(flow)) if not args.separator: fmt = ('%%-%ds%s%%-%ds%s%%-%ds' % (node_width, sep, flow_width, sep, node_width)) out.write(fmt % (src, flow, dst)) if args.timeline: out.write(sep) out.write( coma.join( str(elt) for elt in sorted(res['flow']['data'] ['meta']['times']))) out.write('\n')
def flatten_contigs(data):
    for name, windows in viewitems(data):
        for index, window in enumerate(windows):
            yield (name, index), dict(window)
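# A small illustrative run of flatten_contigs; the window contents here are
# made up, and any mapping-like window object would work since it is copied
# with dict().
data = {"contig_1": [{"gc": 0.41}, {"gc": 0.52}],
        "contig_2": [{"gc": 0.38}]}
flat = list(flatten_contigs(data))
# [(("contig_1", 0), {"gc": 0.41}), (("contig_1", 1), {"gc": 0.52}),
#  (("contig_2", 0), {"gc": 0.38})]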
def recurrent_net(
        net, cell_net, inputs, initial_cell_inputs,
        links, timestep=None, scope=None, outputs_with_grads=(0,),
        recompute_blobs_on_backward=None, forward_only=False,
):
    '''
    net: the main net the operator should be added to

    cell_net: cell_net which is executed in a recurrent fashion

    inputs: sequences to be fed into the recurrent net. Currently only one
    input is supported. It has to be in a format T x N x (D1...Dk) where
    T is the length of the sequence, N is the batch size and (D1...Dk) are
    the remaining dimensions

    initial_cell_inputs: inputs of the cell_net for timestep 0.
    Format for each input is:
        (cell_net_input_name, external_blob_with_data)

    links: a dictionary mapping cell_net input names at timestep t+1 to
    output names at timestep t. Currently we assume that each output becomes
    an input for the next timestep.

    timestep: name of the timestep blob to be used. If not provided,
    "timestep" is used.

    scope: internal blobs are going to be scoped in the format
    <scope_name>/<blob_name>. If not provided, a scope name is generated
    automatically.

    outputs_with_grads: position indices of output blobs which will receive
    an error gradient (from outside the recurrent network) during
    backpropagation

    recompute_blobs_on_backward: a list of blobs that will be recomputed
    for the backward pass, and thus need not be stored for each forward
    timestep.

    forward_only: if True, only forward steps are executed
    '''
    assert len(inputs) == 1, "Only one input blob is supported so far"

    # Validate scoping
    for einp in cell_net.Proto().external_input:
        assert einp.startswith(CurrentNameScope()), \
            '''
            Cell net external inputs are not properly scoped, use
            AddScopedExternalInputs() when creating them
            '''

    input_blobs = [str(i[0]) for i in inputs]
    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
    op_name = net.NextName('recurrent')

    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        scope_name = op_name if scope is None else scope
        return "{}/{}".format(str(scope_name), str(name))

    # determine inputs that are considered to be references:
    # those that are not referred to in inputs or initial_cell_inputs
    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
    known_inputs += [str(x[0]) for x in initial_cell_inputs]
    if timestep is not None:
        known_inputs.append(str(timestep))
    references = [
        core.BlobReference(b) for b in cell_net.Proto().external_input
        if b not in known_inputs]

    inner_outputs = list(cell_net.Proto().external_output)
    # These gradients are expected to be available during the backward pass
    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}

    # compute the backward pass of the cell net
    if not forward_only:
        backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
            cell_net.Proto().op, inner_outputs_map)
        backward_mapping = {str(k): v for k, v in viewitems(backward_mapping)}

        backward_cell_net = core.Net("RecurrentBackwardStep")
        del backward_cell_net.Proto().op[:]

        if recompute_blobs_on_backward is not None:
            # Insert operators to re-compute the specified blobs.
            # They are added in the same order as for the forward pass, thus
            # the order is correct.
            recompute_blobs_on_backward = {str(b) for b in
                                           recompute_blobs_on_backward}

            for op in cell_net.Proto().op:
                if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
                    backward_cell_net.Proto().op.extend([op])
                    # This fires if outputs other than the declared ones
                    # are computed by the ops that are recomputed
                    assert set(op.output).issubset(recompute_blobs_on_backward)

        backward_cell_net.Proto().op.extend(backward_ops)
        # compute blobs used but not defined in the backward pass
        backward_ssa, backward_blob_versions = core.get_ssa(
            backward_cell_net.Proto())
        undefined = core.get_undefined_blobs(backward_ssa)

        # also add to the output list the intermediate outputs of fwd_step
        # that are used by backward.
        ssa, blob_versions = core.get_ssa(cell_net.Proto())
        scratches = [
            blob
            for blob, ver in viewitems(blob_versions)
            if (ver > 0 and
                blob in undefined and
                blob not in cell_net.Proto().external_output)
        ]
        backward_cell_net.Proto().external_input.extend(scratches)
        backward_cell_net.Proto().type = 'simple'
    else:
        backward_cell_net = None

    all_inputs = [i[1] for i in inputs] + [
        x[1] for x in initial_cell_inputs] + references
    all_outputs = []

    cell_net.Proto().type = 'rnn'

    # Internal arguments used by RecurrentNetwork operator

    # Links are in the format (blob_name, recurrent_states, offset).
    # At timestep t we know that the corresponding data block is at
    # position t + offset in the recurrent_states tensor
    forward_links = []
    backward_links = []

    # Aliases are used to expose outputs to the external world
    # Format: (internal_blob, external_blob, offset)
    # Negative offset stands for going from the end,
    # positive - from the beginning
    aliases = []

    # State blobs that hold the inputs to the cell net across timesteps
    recurrent_states = []

    for cell_input, _ in initial_cell_inputs:
        cell_input = str(cell_input)
        # Recurrent_states is going to be (T + 1) x ...
        # It stores all inputs and outputs of the cell net over time,
        # or their gradients in the case of the backward pass.
        state = s(cell_input + "_states")
        states_grad = state + "_grad"
        cell_output = links[str(cell_input)]
        forward_links.append((cell_input, state, 0))
        forward_links.append((cell_output, state, 1))
        aliases.append((state, cell_output + "_all", 1))
        aliases.append((state, cell_output + "_last", -1))
        all_outputs.extend([cell_output + "_all", cell_output + "_last"])

        recurrent_states.append(state)

        if backward_cell_net is not None:
            backward_links.append((cell_output + "_grad", states_grad, 1))
            backward_cell_net.Proto().external_input.append(
                str(cell_output) + "_grad")

            recurrent_input_grad = cell_input + "_grad"
            if not backward_blob_versions.get(recurrent_input_grad, 0):
                # If nobody writes to this recurrent input gradient, we need
                # to make sure it gets to the states grad blob after all.
                # We do this by using backward_links, which triggers an alias.
                # This logic is used, for example, in the SumOp case.
                backward_links.append(
                    (backward_mapping[cell_input], states_grad, 0))
            else:
                backward_links.append((cell_input + "_grad", states_grad, 0))

    for input_t, input_blob in inputs:
        forward_links.append((str(input_t), str(input_blob), 0))

    if backward_cell_net is not None:
        for input_t, input_blob in inputs:
            backward_links.append((
                backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
            ))
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_input)
        backward_cell_net.Proto().external_input.extend(
            cell_net.Proto().external_output)

    def unpack_triple(x):
        if x:
            a, b, c = zip(*x)
            return a, b, c
        return [], [], []

    # Splitting into separate lists so we can pass them to c++,
    # where we assemble them back
    link_internal, link_external, link_offset = unpack_triple(forward_links)
    alias_src, alias_dst, alias_offset = unpack_triple(aliases)

    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]

    # Make sure that recurrent gradients accumulate with internal gradients
    # (if a blob in the backward_cell_net receives gradient from both an
    # external connection as well as from within the backward_cell_net,
    # those gradients need to be added together, rather than one overwriting
    # the other)
    if backward_cell_net is not None:
        proto = backward_cell_net.Proto()
        operators = []
        while len(proto.op) > 0:
            op = proto.op[-1]
            proto.op.remove(op)
            operators.append(op)
        for op in operators[::-1]:
            proto.op.extend([op])
            for j, output_blob in enumerate(op.output):
                if output_blob in proto.external_input:
                    # An in-place operation won't cause issues because it
                    # takes the existing value of the blob into account
                    if output_blob in op.input:
                        continue
                    output_blob = core.BlobReference(output_blob)
                    accum_blob = output_blob + "_accum"
                    proto.op[-1].output[j] = str(accum_blob)
                    backward_cell_net.Sum(
                        [output_blob, accum_blob],
                        [output_blob],
                    )

    backward_args = {}
    if backward_cell_net is not None:
        backward_mapping_keys = set(viewkeys(backward_mapping))
        backward_link_internal, backward_link_external, backward_link_offset = \
            unpack_triple(backward_links)
        params = [x for x in references if x in backward_mapping_keys]
        param_grads = [
            str(backward_mapping[x])
            for x in references
            if x in backward_mapping_keys
        ]
        if recompute_blobs_on_backward is None:
            recompute_blobs_on_backward = set()
        backward_args = {
            'param': [all_inputs.index(p) for p in params],
            'backward_link_internal': [str(l) for l in backward_link_internal],
            'backward_link_external': [str(l) for l in backward_link_external],
            'backward_link_offset': backward_link_offset,
            'backward_step_net': str(backward_cell_net.Proto()),
            'outputs_with_grads': outputs_with_grads,
            'recompute_blobs_on_backward': [
                str(b) for b in recompute_blobs_on_backward
            ],
            'param_grads': param_grads,
        }

    results = net.RecurrentNetwork(
        all_inputs,
        all_outputs + [s("step_workspaces")],
        alias_src=alias_src,
        alias_dst=[str(a) for a in alias_dst],
        alias_offset=alias_offset,
        recurrent_states=recurrent_states,
        initial_recurrent_state_ids=[
            all_inputs.index(i) for i in recurrent_inputs
        ],
        link_internal=[str(l) for l in link_internal],
        link_external=[str(l) for l in link_external],
        link_offset=link_offset,
        step_net=str(cell_net.Proto()),
        timestep="timestep" if timestep is None else str(timestep),
        **backward_args
    )

    # Restore net type since 'rnn' is not recognized outside RNNs
    cell_net.Proto().type = 'simple'

    # The last output is a list of step workspaces,
    # which is only needed internally for gradient propagation
    return results[:-1]
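# A minimal usage sketch of recurrent_net above, not part of the original
# source. It assumes the usual caffe2 imports are available; the blob names
# ("seq_input", "hidden_init", ...) and the trivial additive cell are
# illustrative assumptions, not anything prescribed by the library.
from caffe2.python import core

step_net = core.Net("toy_step")
# One timestep of the "cell": hidden_t = hidden_t_prev + input_t
hidden_t = step_net.Add(["hidden_t_prev", "input_t"], "hidden_t")
step_net.AddExternalOutput(hidden_t)

train_net = core.Net("toy_train")
# "seq_input" is expected to be a T x N x D sequence blob and "hidden_init"
# an N x D initial state blob, both provided externally.
hidden_all, hidden_last = recurrent_net(
    net=train_net,
    cell_net=step_net,
    inputs=[("input_t", "seq_input")],
    initial_cell_inputs=[("hidden_t_prev", "hidden_init")],
    links={"hidden_t_prev": "hidden_t"},  # input at t+1 <- output at t
    scope="toy_rnn",
)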
def update(self):
    try:
        with llfuse.lock_released:
            self._updating_lock.acquire()
            if not self.stale():
                return

            contents = {}
            roots = []
            root_owners = set()
            objects = {}

            methods = self.api._rootDesc.get('resources')["groups"]['methods']
            if 'httpMethod' in methods.get('shared', {}):
                page = []
                while True:
                    resp = self.api.groups().shared(
                        filters=[['group_class', '=', 'project']] + page,
                        order="uuid",
                        limit=10000,
                        count="none",
                        include="owner_uuid").execute()
                    if not resp["items"]:
                        break
                    page = [["uuid", ">", resp["items"][len(resp["items"]) - 1]["uuid"]]]
                    for r in resp["items"]:
                        objects[r["uuid"]] = r
                        roots.append(r["uuid"])
                    for r in resp["included"]:
                        objects[r["uuid"]] = r
                        root_owners.add(r["uuid"])
            else:
                all_projects = arvados.util.list_all(
                    self.api.groups().list, self.num_retries,
                    filters=[['group_class', '=', 'project']],
                    select=["uuid", "owner_uuid"])
                for ob in all_projects:
                    objects[ob['uuid']] = ob

                current_uuid = self.current_user['uuid']
                for ob in all_projects:
                    if ob['owner_uuid'] != current_uuid and ob['owner_uuid'] not in objects:
                        roots.append(ob['uuid'])
                        root_owners.add(ob['owner_uuid'])

                lusers = arvados.util.list_all(
                    self.api.users().list, self.num_retries,
                    filters=[['uuid', 'in', list(root_owners)]])
                lgroups = arvados.util.list_all(
                    self.api.groups().list, self.num_retries,
                    filters=[['uuid', 'in', list(root_owners) + roots]])

                for l in lusers:
                    objects[l["uuid"]] = l
                for l in lgroups:
                    objects[l["uuid"]] = l

            for r in root_owners:
                if r in objects:
                    obr = objects[r]
                    if obr.get("name"):
                        contents[obr["name"]] = obr
                    #elif obr.get("username"):
                    #    contents[obr["username"]] = obr
                    elif "first_name" in obr:
                        contents[u"{} {}".format(obr["first_name"], obr["last_name"])] = obr

            for r in roots:
                if r in objects:
                    obr = objects[r]
                    if obr['owner_uuid'] not in objects:
                        contents[obr["name"]] = obr

        # end with llfuse.lock_released, re-acquire lock

        self.merge(viewitems(contents),
                   lambda i: i[0],
                   lambda a, i: a.uuid() == i[1]['uuid'],
                   lambda i: ProjectDirectory(self.inode, self.inodes, self.api,
                                              self.num_retries, i[1],
                                              poll=self._poll,
                                              poll_time=self._poll_time))
    except Exception:
        _logger.exception("arv-mount shared dir error")
    finally:
        self._updating_lock.release()
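# A minimal sketch, not from the Arvados source, of the keyset-style
# pagination idiom update() uses above: order by uuid, then request
# everything strictly after the last uuid seen. `list_endpoint` is a
# hypothetical callable standing in for self.api.groups().shared(...).
def iterate_pages(list_endpoint, base_filters, page_size=10000):
    page_filter = []
    while True:
        resp = list_endpoint(
            filters=base_filters + page_filter,
            order="uuid",
            limit=page_size,
            count="none",
        ).execute()
        items = resp["items"]
        if not items:
            return
        for item in items:
            yield item
        # Resume strictly after the last uuid of this page.
        page_filter = [["uuid", ">", items[-1]["uuid"]]]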