예제 #1
0
    def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their ids and frequencies.

        Every row is parsed with ``Vocabulary2Id._unstringify_path_context``
        and the first three items of the parsed result are each counted once.
        The counts are then summed per distinct item over all rows. Items
        whose type is exactly ``str`` are collected as values, items whose
        type is exactly ``tuple`` are collected as paths, and items of any
        other type are ignored.

        :param rows: row structure is ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        :return: tuple ``(value2index, path2index, value2freq, path2freq)``:
            * value2index / path2index: item -> position in the collected
              list (ids start at 0, values and paths are numbered separately)
            * value2freq / path2freq: item -> summed count
        """

        def _flatten_row(row: Row):
            # NOTE(review): the whole row is handed to the parser and each
            # parsed item is counted as 1, so `val` from the documented row
            # structure is not used here - confirm this is intended.
            parsed = Vocabulary2Id._unstringify_path_context(row)
            return [(parsed[0], 1), (parsed[1], 1), (parsed[2], 1)]

        # Persisted because the aggregated counts are scanned twice below.
        counts = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        try:
            # Exact type comparison: instances of str/tuple subclasses are
            # not matched by either filter.
            values = counts.filter(lambda x: type(x[0]) is str).collect()
            paths = counts.filter(lambda x: type(x[0]) is tuple).collect()
        finally:
            # Release the cached data even when one of the collects fails.
            counts.unpersist()

        value2index = {item: index for index, (item, _) in enumerate(values)}
        path2index = {item: index for index, (item, _) in enumerate(paths)}
        value2freq = {item: freq for item, freq in values}
        path2freq = {item: freq for item, freq in paths}

        return value2index, path2index, value2freq, path2freq
예제 #2
0
def analyze(rddDns: RDD) -> Dict[str, Result]:
    """Run the unigram analysis on the domains seen around every flagged IP.

    The work is done in three timed phases, each reported at INFO level:

    1. Keep the records for which ``premiseCheck`` (pre-bound with the
       limits from ``Global``) is falsy and gather their distinct source
       IPs as strings.
    2. For every such IP, scan ``rddDns`` again and collect the distinct
       parsed domain names of the records where the IP is found in
       ``[dns.dip, dns.sip]``.
    3. Feed each IP's domain array to ``unigramAnalysis``.

    ``rddDns.unpersist()`` is called before returning.

    :param rddDns: RDD of DNS records; the code reads ``sip``, ``dip`` and
        ``getName()`` from each record.
    :return: dict mapping ``str(ip)`` to ``repr(unigramAnalysis(domains))``.
        NOTE(review): the values are ``repr`` strings although the
        annotation says ``Result`` - confirm which one is intended.
    """
    logger = getLogger()
    passes_premise = functools.partial(
        premiseCheck,
        Global.ALLOWED_NAME_LEN,
        Global.RESTRICTED_SYMS,
        Global.MAX_BODY_SIZE,
        Global.MIN_TTL,
    )

    # Phase 1: distinct source IPs of the records failing the premise check.
    stopwatch = Timer()
    # presumably compose(f, g)(x) == f(g(x)), i.e. a negated premise check
    fails_premise = compose(operator.not_, passes_premise)
    flagged_records = rddDns.filter(fails_premise)
    flagged_sources = flagged_records.map(lambda dns: str(dns.sip)).distinct()
    # NOTE(review): the iterator is only consumed in phase 2, so part of the
    # Spark work may be attributed to the next timer - confirm.
    ip_batches = flagged_sources.glom().toLocalIterator()
    logger.info(f'Time spent on premis analysis = {stopwatch.elapsed()}')

    # Phase 2: one scan of rddDns per flagged IP to gather its domains.
    stopwatch = Timer()
    domains_by_ip = {}
    for batch in ip_batches:
        for ip in set(batch):
            if ip in domains_by_ip:
                continue
            logger.debug(ip)
            # NOTE(review): `ip` is a str here while dip/sip are compared
            # as-is - verify the record fields compare equal to strings.
            records_with_ip = rddDns.filter(
                lambda dns: ip in [dns.dip, dns.sip])
            domain_names = records_with_ip.map(
                lambda dns: parseDomain(str(dns.getName()))).distinct()
            domains_by_ip[ip] = np.array(domain_names.collect())
            logger.debug(domains_by_ip.get(ip))

    logger.info(f'Time spent on searching packets for chosen IPs = {stopwatch.elapsed()}')

    # Phase 3: unigram analysis per IP, stored as its repr().
    stopwatch = Timer()
    report = {
        str(address): repr(unigramAnalysis(domains))
        for address, domains in domains_by_ip.items()
    }
    logger.info(f'Time spent on unigram distribution analysis = {stopwatch.elapsed()}')

    rddDns.unpersist()

    return report