def build_vocabularies(self, rows: RDD):
    """
    Process rows to gather values and paths with their frequencies.

    :param rows: row structure is ((key, doc), val) where:
        * key: str with the path context
        * doc: file name
        * val: number of occurrences of key in doc
    """
    def _flatten_row(row: Row):
        # Remove the namespace "v." from the string so it can be parsed as a tuple.
        k = Vocabulary2Id._unstringify_path_context(row)
        return [(k[0], 1), (k[1], 1), (k[2], 1)]

    rows = rows \
        .flatMap(_flatten_row) \
        .reduceByKey(operator.add) \
        .persist()

    values = rows.filter(lambda x: type(x[0]) == str).collect()
    paths = rows.filter(lambda x: type(x[0]) == tuple).collect()

    value2index = {w: id for id, (w, _) in enumerate(values)}
    path2index = {w: id for id, (w, _) in enumerate(paths)}
    value2freq = {w: freq for w, freq in values}
    path2freq = {w: freq for w, freq in paths}

    rows.unpersist()

    return value2index, path2index, value2freq, path2freq
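# Hedged usage sketch (not from the original codebase): it assumes only a local
# SparkContext and illustrates the flatMap -> reduceByKey counting idiom that
# build_vocabularies relies on, with plain (value, value, path) triples standing
# in for the real Vocabulary2Id._unstringify_path_context output.
from operator import add

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Each element plays the role of an unstringified path context:
# two str values and one tuple path.
triples = sc.parallelize([
    ('x', 'y', ('UP', 'DOWN')),
    ('x', 'z', ('UP', 'LEFT')),
])

counts = triples.flatMap(lambda k: [(k[0], 1), (k[1], 1), (k[2], 1)]) \
    .reduceByKey(add) \
    .persist()

values = counts.filter(lambda x: isinstance(x[0], str)).collect()
paths = counts.filter(lambda x: isinstance(x[0], tuple)).collect()

print(dict(values))  # {'x': 2, 'y': 1, 'z': 1}
print(dict(paths))   # {('UP', 'DOWN'): 1, ('UP', 'LEFT'): 1}

counts.unpersist()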
def clean_claims(claims: RDD, b_item_map: Broadcast):
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                claim = claim._replace(object=item_map[claim.object])
                return claim
            else:
                return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit
            unit = unit.split('/')[-1]
            if unit in item_map:
                claim = claim._replace(object=item_map[unit])
                return claim
            else:
                return None
        return claim

    dt_filter = {
        'wikibase-item',
        'string',
        'monolingualtext',
        'quantity',
        'time',
    }

    return claims.filter(lambda c: c.datatype in dt_filter).map(clean).filter(
        lambda c: c is not None)
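# Hedged usage sketch: the real claim record type is not shown above, so this
# assumes a minimal namedtuple with `datatype` and `object` fields (namedtuples
# provide the `_replace` used by clean_claims) and a broadcast Q-id -> label map.
from collections import namedtuple

from pyspark import SparkContext

Claim = namedtuple('Claim', ['datatype', 'object'])

sc = SparkContext.getOrCreate()
b_item_map = sc.broadcast({'Q42': 'Douglas Adams', 'Q5': 'human'})

claims = sc.parallelize([
    Claim('wikibase-item', 'Q42'),      # kept, object mapped to its label
    Claim('wikibase-item', 'Q999999'),  # dropped: not in the item map
    Claim('string', 'some literal'),    # kept unchanged
    Claim('globe-coordinate', None),    # dropped by the datatype filter
])

print(clean_claims(claims, b_item_map).collect())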
def extract_property_map(parsed_wikidata: RDD):
    def parse_property(prop):
        label = prop["labels"]["en"]["value"]
        return prop["id"], label

    return (parsed_wikidata.filter(lambda d: d["type"] == "property")
            .map(parse_property)
            .collectAsMap())
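# Hedged usage sketch: a tiny hand-made stand-in for the parsed Wikidata dump,
# just to show the shape of the returned property map.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

parsed_wikidata = sc.parallelize([
    {'type': 'property', 'id': 'P31', 'labels': {'en': {'value': 'instance of'}}},
    {'type': 'item', 'id': 'Q42', 'labels': {'en': {'value': 'Douglas Adams'}}},
])

print(extract_property_map(parsed_wikidata))  # {'P31': 'instance of'}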
def analyze(rddDns: RDD) -> Dict[str, Result]:
    # filter out trusted DNS
    log = getLogger()
    premiseCheck_ = functools.partial(premiseCheck, Global.ALLOWED_NAME_LEN,
                                      Global.RESTRICTED_SYMS, Global.MAX_BODY_SIZE,
                                      Global.MIN_TTL)

    timer = Timer()
    # cache because only this RDD will be used in the application
    ipPartGen = rddDns.filter(compose(operator.not_, premiseCheck_)) \
        .map(lambda dns: str(dns.sip)) \
        .distinct() \
        .glom() \
        .toLocalIterator()
    log.info(f'Time spent on premise analysis = {timer.elapsed()}')

    timer = Timer()
    ipdoms = {}
    # TODO: refactor this block
    for ipPart in ipPartGen:
        for ip in set(ipPart):
            if ip not in ipdoms:
                log.debug(ip)
                ipdoms[ip] = np.array(
                    rddDns.filter(lambda dns: ip in [dns.dip, dns.sip])
                    .map(lambda dns: parseDomain(str(dns.getName())))
                    .distinct()
                    .collect())
                log.debug(ipdoms.get(ip))
    log.info(f'Time spent on searching packets for chosen IPs = {timer.elapsed()}')

    timer = Timer()
    result = []
    for ip, doms in ipdoms.items():
        result.append((str(ip), repr(unigramAnalysis(doms))))
    log.info(f'Time spent on unigram distribution analysis = {timer.elapsed()}')

    rddDns.unpersist()
    return dict(result)
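# Hedged sketch of the distinct().glom().toLocalIterator() idiom used above:
# it streams each partition of distinct values to the driver as a list, so the
# driver never has to hold the full distinct set in memory at once.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

ips = sc.parallelize(['10.0.0.1', '10.0.0.2', '10.0.0.1', '10.0.0.3'], numSlices=2)

for partition in ips.distinct().glom().toLocalIterator():
    for ip in set(partition):
        print(ip)  # process one distinct IP at a time, e.g. look up its domains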
def compute_fdr_and_filter_results(
    moldb: MolecularDB,
    fdr: FDR,
    ion_formula_map_df: pd.DataFrame,
    formula_metrics_df: pd.DataFrame,
    formula_images_rdd: pyspark.RDD,
    scoring_model: Optional[ScoringModel],
) -> Tuple[pd.DataFrame, pyspark.RDD, FdrDiagnosticBundle]:
    """Compute FDR for database annotations and filter them."""
    moldb_formula_map_df = ion_formula_map_df[
        ion_formula_map_df.moldb_id == moldb.id
    ].drop('moldb_id', axis=1)
    moldb_metrics_fdr_df = compute_fdr(fdr, formula_metrics_df, moldb_formula_map_df, scoring_model)

    if not moldb.targeted:
        max_fdr = 0.5
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[moldb_metrics_fdr_df.fdr <= max_fdr]
    else:
        # fdr is not null for target ion formulas
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[~moldb_metrics_fdr_df.fdr.isnull()]

    moldb_ion_images_rdd = formula_images_rdd.filter(
        lambda kv: kv[0] in moldb_metrics_fdr_df.index  # pylint: disable=cell-var-from-loop
    )
    moldb_ion_metrics_df = moldb_metrics_fdr_df.merge(
        fdr.target_modifiers_df, left_on='modifier', right_index=True
    )

    # Extract the metrics for just this database, avoiding duplicates and handling missing rows
    all_metrics_df = formula_metrics_df.merge(
        moldb_formula_map_df.index.rename('formula_i').drop_duplicates().to_frame(index=True)[[]],
        left_index=True,
        right_index=True,
        how='inner',
    )
    formula_map_df = (
        moldb_formula_map_df.drop(columns=['ion_formula'])
        .rename_axis(index='formula_i')
        .reset_index()
    )
    fdr_bundle = FdrDiagnosticBundle(
        decoy_sample_size=fdr.decoy_sample_size,
        decoy_map_df=fdr.td_df,
        formula_map_df=formula_map_df,
        metrics_df=all_metrics_df,
    )

    return moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle
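# Hedged sketch of the image-filtering step above: keep only the images whose
# formula index survives the FDR threshold. The data and column names here are
# invented for illustration; only the `kv[0] in df.index` idiom is the point.
import pandas as pd
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

metrics_fdr_df = pd.DataFrame(
    {'fdr': [0.05, 0.2, 0.8]}, index=pd.Index([1, 2, 3], name='formula_i')
)
metrics_fdr_df = metrics_fdr_df[metrics_fdr_df.fdr <= 0.5]  # drops formula_i == 3

formula_images_rdd = sc.parallelize([(1, 'img1'), (2, 'img2'), (3, 'img3')])
kept = formula_images_rdd.filter(lambda kv: kv[0] in metrics_fdr_df.index)
print(kept.collect())  # [(1, 'img1'), (2, 'img2')]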
def filter_big_time_step(rdd: RDD, min_step: float, max_step: float) -> RDD:
    """
    Some planes occasionally disappear for a long time (1k seconds or more)
    without moving. We remove those planes from the records (SHALL WE ???).
    """
    def map_get_max_time_step(record):
        time = record.Time
        max_step = 0
        min_step = 0
        if len(time) > 1:
            arr = [t2 - t1 for t1, t2 in zip(time[:-1], time[1:])]
            max_step = np.max(arr)
            min_step = np.min(arr)
        return record, min_step, max_step

    rdd = rdd.map(map_get_max_time_step)
    rdd = rdd.filter(lambda r: r[1] >= min_step * 1000 and r[2] <= max_step * 1000) \
        .map(lambda r: r[0])  # drop the per-record min and max steps, keep only the record
    # print(f"Applied filter on time steps of size {(min_step, max_step)}, remains {rdd.count()} records")
    return rdd
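# Hedged usage sketch: the real record type is not shown, so a namedtuple with
# a `Time` field (a list of timestamps, assumed to be in milliseconds, since
# the filter scales the second-based bounds by 1000) stands in for it.
from collections import namedtuple

import numpy as np  # required by filter_big_time_step above
from pyspark import SparkContext

Record = namedtuple('Record', ['Icao', 'Time'])

sc = SparkContext.getOrCreate()

records = sc.parallelize([
    Record('ABC123', [0, 5_000, 10_000]),     # 5 s steps: kept
    Record('DEF456', [0, 5_000, 2_000_000]),  # ~2000 s gap: dropped
])

# Keep records whose time steps all lie between 1 s and 600 s.
kept = filter_big_time_step(records, 1, 600)
print([r.Icao for r in kept.collect()])  # ['ABC123']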
def normal_order(self, terms: RDD, **kwargs):
    """Normal order the terms according to the generalized Wick theorem.

    The actual expansion is based on the information given in the
    subclasses by the abstract properties.
    """
    comparator = kwargs.pop('comparator', self.comparator)
    contractor = kwargs.pop('contractor', self.contractor)
    if len(kwargs) != 0:
        raise ValueError('Invalid arguments to Wick normal order', kwargs)

    phase = self.phase
    symms = self.symms
    resolvers = self.resolvers

    terms.cache()
    terms_to_proc = terms.filter(lambda x: len(x.vecs) > 1)
    keep_top = 0 if comparator is None else 1
    terms_to_keep = terms.filter(lambda x: len(x.vecs) <= keep_top)
    terms_to_proc.cache()
    if terms_to_proc.count() == 0:
        return terms_to_keep

    # Triples: term, contractions, schemes.
    wick_terms = terms_to_proc.map(lambda x: _prepare_wick(
        x, comparator, contractor, symms.value, resolvers.value))

    if self._wick_parallel == 0:
        normal_ordered = wick_terms.flatMap(lambda x: [
            _form_term_from_wick(x[0], x[1], phase, resolvers.value, i)
            for i in x[2]
        ])
    elif self._wick_parallel == 1:
        flattened = wick_terms.flatMap(
            lambda x: [(x[0], x[1], i) for i in x[2]])
        if self._num_partitions is not None:
            flattened = flattened.repartition(self._num_partitions)
        normal_ordered = flattened.map(lambda x: _form_term_from_wick(
            x[0], x[1], phase, resolvers.value, x[2]))
    elif self._wick_parallel == 2:
        # This level of parallelism is reserved for really hard problems.
        expanded = []
        for term, contrs, schemes in wick_terms.collect():
            # Work around a probable Spark bug: problems occur when closures
            # created inside a loop are distributed out.
            form_term = functools.partial(
                _form_term_from_wick_bcast, term, contrs, phase, resolvers)
            curr = self._ctx.parallelize(schemes).map(form_term)
            expanded.append(curr)
        normal_ordered = self._ctx.union(expanded)
    else:
        raise ValueError(
            'Invalid Wick expansion parallel level', self._wick_parallel)

    return terms_to_keep.union(normal_ordered)
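# Hedged sketch of the closure-in-a-loop workaround mentioned in the comment
# above: binding the loop variable with functools.partial instead of closing
# over it ensures every job sees the value from its own iteration.
import functools

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

def scale(factor, x):
    return factor * x

results = []
for factor in (2, 3):
    # A plain `lambda x: factor * x` would capture `factor` by reference and
    # could evaluate to the last loop value when the job actually runs.
    mapper = functools.partial(scale, factor)
    results.append(sc.parallelize([1, 2, 3]).map(mapper))

print([r.collect() for r in results])  # [[2, 4, 6], [3, 6, 9]]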
def remove_header(rdd: RDD) -> RDD:
    # `first()` returns the first element of the RDD (the header row), not an RDD.
    header = rdd.first()
    return rdd.filter(lambda row: row != header)
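# Hedged usage sketch: drop the header line of a CSV-like text RDD. Note that
# the filter removes every row equal to the header, not just the first one.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

lines = sc.parallelize(['name,age', 'alice,30', 'bob,25'])
print(remove_header(lines).collect())  # ['alice,30', 'bob,25']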