Example #1
    def request(url, tries=3):
        """Wrapper around :func:`urlopen` to AWIS call.

        On failure, will attempt another 2 tries for success.

        **Args:**
            *url*: the AWIS URL to call

            *tries*: number of failed tries allowed before flagging this
            attempt as a failure

        **Returns:**
            the HTTP response value

        """
        failed_requests = 0
        response_value = None
        while failed_requests < tries:
            try:
                log.debug('Request %d of %d: "%s"', (failed_requests + 1),
                          tries, url)
                response = urlopen(url)
                if response.code == 200:
                    response_value = response.read()
                    break
            except HTTPError as err:
                log.error('Request failed "%s"', err)

            failed_requests += 1
            if failed_requests >= tries:
                log.error('All requests failed')

        return response_value
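The bounded-retry shape above is easy to lift out and test on its own. The sketch below is a self-contained, hypothetical illustration of the same loop (no network involved): a flaky callable that fails twice before succeeding, wrapped in a fixed number of attempts.

    # Hypothetical, self-contained illustration of the bounded-retry loop:
    # retry() keeps calling func() until it returns or *tries* attempts fail.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger(__name__)


    def retry(func, tries=3):
        """Call *func* until it succeeds or *tries* attempts have failed."""
        result = None
        for attempt in range(1, tries + 1):
            try:
                log.debug('Attempt %d of %d', attempt, tries)
                result = func()
                break
            except IOError as err:
                log.error('Attempt failed "%s"', err)
        else:
            log.error('All attempts failed')
        return result


    class Flaky:
        """Fails the first two calls, then succeeds."""
        def __init__(self):
            self.calls = 0

        def __call__(self):
            self.calls += 1
            if self.calls < 3:
                raise IOError('transient failure %d' % self.calls)
            return b'payload'


    print(retry(Flaky(), tries=3))   # b'payload'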
Example #2
    def flatten_domains(self,
                        max_read_count=None,
                        topic='alexa-results',
                        group_id='default',
                        dry=False):
        """Takes the Alexa batched domain results and
        split out into separate, JSON equivalents.

        Kwargs:
            *max_read_count*: number of batched domains to read.  `None`
            reads all offsets associated with the *group_id*

            *group_id*: Kafka managed consumer element that manages
            the messages read from the topic

        """
        count_q = multiprocessing.Queue()

        target = self.flatten_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_count = 0
        while not count_q.empty():
            total_count += count_q.get()

        log.debug('Flatten workers total records read %d', total_count)

        return total_count
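flatten_domains fans work out to flatten_worker threads and then drains the shared queue to total the counts. A minimal, self-contained sketch of that fan-out/fan-in counting pattern follows; plain threading.Thread stands in for the project's domain_intel.utils.threader helper, whose exact signature is not shown here.

    # Self-contained sketch of the fan-out/fan-in counting pattern:
    # each worker pushes its record count onto a shared multiprocessing.Queue
    # and the caller aggregates the totals.
    import multiprocessing
    import threading


    def worker(count_q, n_records):
        # A real worker would consume Kafka messages; here we just report a count.
        count_q.put(n_records)


    count_q = multiprocessing.Queue()
    workers = [threading.Thread(target=worker, args=(count_q, n))
               for n in (3, 5, 7)]
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()

    # One count per worker; the project drains the same queue with a
    # "while not count_q.empty()" loop once its threader has joined the workers.
    total_count = sum(count_q.get() for _ in workers)
    print(total_count)   # 15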
Example #3
File: stages.py Project: loum/domain-intel
    def persist(self):
        """Persist flattened (processed) GeoDNS data to ArangoDB.

        :attr:`max_read_count` can limit the number of records read from
        *topic*.  The default action is to read all available messages.

        The default consumer :attr:`topics` is `dns-geodns-parsed`.

        The :attr:`dry` flag will simulate execution.  No records will be
        published.

        Returns:
            total count of records written to the DB across all workers

        """
        count_q = multiprocessing.Queue()

        target = self.persist_worker
        args = (count_q, self.max_read_count, self.kafka_consumer_topics[0],
                self.kafka_consumer_group_id)
        kwargs = {'dry': self.dry}
        threads = domain_intel.common.CONFIG.get('threads', 1)
        domain_intel.utils.threader(threads, target, *args, **kwargs)

        total_count = 0
        while not count_q.empty():
            total_count += count_q.get()

        log.debug('Persisted GeoDNS total count %d', total_count)

        return total_count
Example #4
    def resolve(self, ipv4, time_epoch=None):
        """resolve an ipv4 address and optional point in time to a geog record (see ipechelon.geog* in aurora.
        for in depth documentation, look at the compass project.
        
        this method does not have an 'unparsed' representation, as we control both ends."""
        url = self.url()
        log.debug("compass requesting %s", ipv4)
        try:
            res = self.session.post(
                url,
                data=json.dumps({
                    "ip": ipv4,
                    "time": time_epoch if time_epoch else int(time.time()),
                }))

            response = self._parse_results(res.content)
            res.raise_for_status()
            return response

        # bubble this up to caller
        except CompassServerEmptyResponse as exc:
            raise exc
        # wrap generic fatal error
        except Exception as exc:
            raise_from(
                CompassServerError("couldn't call geog compass backend: %s" %
                                   (res.content)), exc)
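For reference, the JSON body that this resolver posts is small; the sketch below only reconstructs that payload (the address is illustrative and no request is made).

    # Sketch of the request body posted by resolve(); values are illustrative.
    import json
    import time

    ipv4 = '192.0.2.10'
    time_epoch = None           # falls back to "now" when not supplied

    body = json.dumps({
        'ip': ipv4,
        'time': time_epoch if time_epoch else int(time.time()),
    })
    print(body)                 # e.g. {"ip": "192.0.2.10", "time": 1700000000}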
Example #5
    def _get_or_error(self, url, headers):
        try:
            log.debug("requesting url %s" % url)
            res = self.session.get(url, headers=headers)
            res.raise_for_status()
            return res
        except Exception as exc:
            raise_from(CheckHostNetError("couldn't call check backend"), exc)
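raise_from here presumably comes from six (or python-future) and keeps the original traceback attached on both Python 2 and 3. On Python 3 alone the same effect is the raise ... from syntax, as the self-contained sketch below shows (the error class is a hypothetical stand-in for the project's own type).

    # Exception chaining: the wrapping error keeps a reference to the original
    # failure on its __cause__ attribute (Python 3 syntax shown).
    class CheckHostNetError(Exception):
        """Hypothetical stand-in for the project's error type."""


    def fetch():
        raise IOError('connection refused')


    try:
        try:
            fetch()
        except Exception as exc:
            raise CheckHostNetError("couldn't call check backend") from exc
    except CheckHostNetError as wrapped:
        print(wrapped)              # couldn't call check backend
        print(wrapped.__cause__)    # connection refused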
Example #6
    def read_worker(self,
                    queue,
                    max_read_count,
                    topic,
                    group_id,
                    slurp,
                    dry=False):
        """Read all domains from the Kafka partitions.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`read_domains`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('Read worker set to read %s messages', max_read_count
                  or 'all')

        with self.producer() as producer:
            with self.consumer(topic, group_id) as consumer:
                domain_batch = []
                messages_read = 0
                total_messages_read = 0

                for message in consumer:
                    messages_read += 1
                    total_messages_read += 1

                    domain_batch.append(message.value.rstrip())
                    if (len(domain_batch) > 4
                            or (max_read_count is not None
                                and messages_read >= max_read_count)):
                        if slurp:
                            self.slurp_domains(producer, domain_batch, dry=dry)
                        else:
                            log.info('Domains pending: %s', domain_batch)
                        del domain_batch[:]
                        messages_read = 0

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        break

                # ... and check for laggards.
                if domain_batch:
                    log.info('Processing laggards before close')
                    if slurp:
                        self.slurp_domains(producer, domain_batch, dry=dry)
                    else:
                        log.info('Domains pending: %s', domain_batch)

        queue.put(total_messages_read)
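The worker above groups incoming messages into batches of five and always flushes the trailing partial batch (the "laggards") before closing. A self-contained sketch of just that batch-then-flush shape, with Kafka removed:

    # Group a stream into batches of 5 and flush whatever is left at the end.
    BATCH_SIZE = 5


    def batched(messages, batch_size=BATCH_SIZE):
        batch = []
        for message in messages:
            batch.append(message)
            if len(batch) >= batch_size:
                yield list(batch)
                del batch[:]
        if batch:                   # ... and check for laggards.
            yield list(batch)


    domains = ['domain-{}.com'.format(i) for i in range(12)]
    for batch in batched(domains):
        print(batch)                # two batches of 5, then a laggard batch of 2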
Example #7
    def get_geodns(self):
        """Get all the GeoDNS data.
        """
        all_geodns = []

        for graph_path in self.paths:
            edge = graph_path.get('edges', [])
            vertices = graph_path.get('vertices')

            for ipv in ['ipv4']:
                if edge and '{}_resolves'.format(ipv) in edge[0].get('_id'):
                    for vertice in vertices:
                        log.debug('GeoDNS vertex: %s', vertice)
                        if vertice and not re.match('{}/'.format(ipv),
                                                    vertice.get('_id')):
                            continue

                        ip_addr = vertice.get('_key')
                        dns_org = vertice.get('organisation',
                                              {}).get('name', '')
                        isp = vertice.get('isp', {}).get('name', '')
                        lat = vertice.get('geospatial', {}).get('latitude', '')
                        lng = vertice.get('geospatial',
                                          {}).get('longitude', '')
                        country_code = vertice.get('country', {}).get(
                            'iso3166_code_2', '')
                        country_name = vertice.get('country',
                                                   {}).get('name', '')
                        continent_code = vertice.get('continent',
                                                     {}).get('code', '')
                        continent_name = vertice.get('continent',
                                                     {}).get('name', '')

                        if dns_org:
                            dns_org = '"{}"'.format(dns_org)

                        if isp:
                            isp = '"{}"'.format(isp)

                        token = ipv.upper()
                        kwargs = {
                            '{}_ADDR'.format(token): ip_addr,
                            '{}_ORG'.format(token): dns_org,
                            '{}_ISP'.format(token): isp,
                            '{}_LATITUDE'.format(token): lat,
                            '{}_LONGITUDE'.format(token): lng,
                            '{}_COUNTRY_CODE'.format(token): country_code,
                            '{}_COUNTRY'.format(token): country_name,
                            '{}_CONTINENT_CODE'.format(token): continent_code,
                            '{}_CONTINENT'.format(token): continent_name,
                        }
                        all_geodns.append(kwargs)

        return all_geodns
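The structure this method expects from each graph path is easiest to see with a worked value. The dictionary below is an illustrative, made-up single path: one ipv4_resolves edge plus an ipv4/ vertex carrying the nested GeoDNS attributes that get flattened into the keyword map.

    # Illustrative shape of one traversal path as consumed by get_geodns();
    # all field values are made up.
    sample_path = {
        'edges': [{'_id': 'ipv4_resolves/example-edge'}],
        'vertices': [{
            '_id': 'ipv4/192.0.2.10',
            '_key': '192.0.2.10',
            'organisation': {'name': 'Example Org'},
            'isp': {'name': 'Example ISP'},
            'geospatial': {'latitude': -33.9, 'longitude': 151.2},
            'country': {'iso3166_code_2': 'AU', 'name': 'Australia'},
            'continent': {'code': 'OC', 'name': 'Oceania'},
        }],
    }
    print(sample_path['vertices'][0]['_key'])   # 192.0.2.10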
Example #8
    def slurp_sites_linking_in(self,
                               domain,
                               max_slurps=None,
                               as_json=False,
                               dry=False):
        """Get list of sites linking into *domain*.

        Alexa places an upper limit of 20 on the number of sites that it
        will return per request (or a "slurp").  Subsequent calls must be
        made by incrementing the `Start` request parameter to indicate the
        page to return.  Since there is no way to know how many pages
        need to be slurped, we must test the current result for a list of
        titles.  If no titles are returned or *max_slurps* is breached
        (whichever comes first) then we exit.

        Returns:
            list of titles slurped.  If *as_json* is set then the resultant
            set is returned as a JSON structure

        """
        if max_slurps is None:
            max_slurps = MAX_SLURPS

        all_titles = []
        for start_index in range(max_slurps):
            if start_index >= max_slurps:
                log.debug('SitesLinkingIn domain "%s" threshold breached',
                          domain)
                break
            log.debug('SitesLinkingIn domain "%s" slurp iteration %d of %d',
                      domain, start_index + 1, max_slurps)

            response = None
            if not dry:
                response = self.api.sites_linking_in(domain,
                                                     start_index*SLI_COUNT)
            parser = domain_intel.awisapi.parser.SitesLinkingIn(response)
            titles = parser.extract_titles()

            if titles:
                all_titles.extend(titles)
            else:
                log.info('SitesLinkingIn slurp iteration %d returned '
                         'zero titles: exiting', start_index + 1)
                break

        unique_titles = SitesLinkingIn.unique_titles(all_titles)
        if as_json:
            unique_titles = json.dumps(unique_titles,
                                       sort_keys=True,
                                       indent=4)

        return unique_titles
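The pagination logic is the interesting part: page through in steps of 20 (`Start = start_index * SLI_COUNT`) and stop at the first empty page or after *max_slurps* pages. A self-contained sketch with the AWIS call replaced by a hypothetical fetch_page() stub:

    # Page-until-empty loop: fetch pages of up to 20 titles, stop at the first
    # empty page or after MAX_SLURPS pages, whichever comes first.
    SLI_COUNT = 20
    MAX_SLURPS = 5

    FAKE_RESULTS = ['site-{}.com'.format(i) for i in range(45)]   # ~3 pages


    def fetch_page(start):
        """Stand-in for the AWIS SitesLinkingIn call."""
        return FAKE_RESULTS[start:start + SLI_COUNT]


    all_titles = []
    for start_index in range(MAX_SLURPS):
        titles = fetch_page(start_index * SLI_COUNT)
        if not titles:
            break
        all_titles.extend(titles)

    print(len(all_titles))   # 45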
Example #9
    def flatten_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Read all Alexa TrafficHistory results from the Kafka *topic*.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per the flatten executor that calls this worker.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('TrafficHistory flatten worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('TrafficHistory flatten worker timeout set to %d',
                  self.timeout)

        total_messages_read = 0
        total_messages_put = 0

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                for message in consumer:
                    total_messages_read += 1
                    traffic = TrafficHistory.flatten_xml(message.value)
                    if traffic is None:
                        continue

                    if not dry:
                        total_messages_put += 1
                        producer.send('alexa-traffic-flattened',
                                      traffic.encode('utf-8'))

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        break

        log.info('TrafficHistory flatten worker read|put count %d|%d',
                 total_messages_read, total_messages_put)

        queue.put(tuple([total_messages_read, total_messages_put]))
Example #10
File: utils.py Project: loum/domain-intel
def info(**kwargs):
    """Simple dump to logs/stout of information related to the topics
    we are currently authorised to access.

    """
    log.info('Attempting get of Kafka topic detail information ...')
    with safe_consumer(None, **kwargs) as consumer:
        topics = consumer.topics()
        topic_count = len(topics)
        for index, topic in enumerate(topics, 1):
            log.debug('Authorised topic %d of %d: %s',
                      index, topic_count, topic)
            partitions = [str(x) for x in consumer.partitions_for_topic(topic)]
            log.info('- Partitions: %s', ', '.join(partitions))

    return topics
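safe_consumer is the project's own context-manager wrapper; the introspection calls themselves (topics() and partitions_for_topic()) are plain kafka-python. A minimal sketch using kafka-python directly, assuming a broker is reachable at localhost:9092:

    # Minimal topic/partition introspection with kafka-python (assumes a
    # reachable broker at localhost:9092).
    from kafka import KafkaConsumer

    consumer = KafkaConsumer(bootstrap_servers='localhost:9092')
    for topic in sorted(consumer.topics()):
        partitions = sorted(consumer.partitions_for_topic(topic))
        print('{}: partitions {}'.format(topic, partitions))
    consumer.close()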
Example #11
    def traverse_relationship(self,
                              max_read_count=None,
                              topic='domain-labels',
                              group_id='default',
                              dry=False):
        """Read domain labels from the Kafka topic *topic*
        and uses that that the starting vertex to traverse the graph.
        The hardwired Kafka topic read from is `domain-labels`.

        If *max_read_count* is `None` then all domains will be
        returned.

        The default Kafka *group_id* name used is `default`.  However,
        we can force a re-read of the topic's messages by overriding
        *group_id* with a unique value.

        Returns:
            total count of records read

        """
        log.debug('Traverse worker set to read %s messages', max_read_count
                  or 'all')

        with self.producer() as producer:
            with self.consumer(topic, group_id) as consumer:
                total_messages_read = 0

                for message in consumer:
                    label = message.value.decode('utf-8')
                    result = self.store.traverse_graph(label)
                    if result is None:
                        continue

                    total_messages_read += 1
                    if not dry:
                        producer.send('domain-traversals',
                                      result.encode('utf-8'))

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        break

        log.debug('Domains traverser worker records read %d',
                  total_messages_read)

        return total_messages_read
Example #12
    def persist_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Persist flattened (processed) Alexa domain data to ArangoDB
        worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`persist`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('Data persist worker set to read %s messages', max_read_count
                  or 'all')
        log.debug('Persist worker timeout set to %d', self.timeout)

        total_messages_read = 0
        put_count = 0

        with self.consumer(topic, group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                self.write_to_store(message.value, dry)
                # TODO: quantify successful insert.
                put_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Maximum read threshold %d breached - exiting',
                             max_read_count)
                    break

        log.info('UrlInfo persist worker messages read %d',
                 total_messages_read)

        queue.put((total_messages_read, put_count))
Example #13
    def wide_column_dump(self,
                         max_read_count=None,
                         topic='domain-traversals',
                         group_id='default',
                         dry=False):
        """Takes Domain Intel graph data and dumps to a wide-column CSV
        format suitable for ingest into Google BigQuery.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is `domain-traversals`.

        The *dry* flag will simulate execution.  No records will be
        published.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of domains successfully
            published to the Kafka topic

        """
        count_q = multiprocessing.Queue()

        target = self.wide_column_dump_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_read_count = 0
        total_put_count = 0
        while not count_q.empty():
            counter = count_q.get()
            total_read_count += counter[0]
            total_put_count += counter[1]

        log.debug('Wide-column CSV dump read|put count %d|%d',
                  total_read_count, total_put_count)
        read_put_counts = (total_read_count, total_put_count)

        return read_put_counts
Example #14
    def persist(self,
                max_read_count=None,
                topic='alexa-traffic-flattened',
                group_id='default',
                dry=False):
        """Takes Alexa TrafficHistory records and writes to the persistent
        store.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is ``alexa-traffic-flattened``.

        The *dry* flag will simulate execution.  No records will be
        published.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of domains successfully
            published to the Kafka topic

        """
        count_q = multiprocessing.Queue()

        target = self.persist_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_read_count = 0
        total_put_count = 0
        while not count_q.empty():
            counter = count_q.get()
            total_read_count += counter[0]
            total_put_count += counter[1]

        log.debug('TrafficHistory persist worker read|put count %d|%d',
                  total_read_count, total_put_count)
        read_put_counts = (total_read_count, total_put_count)

        return read_put_counts
Example #15
    def traverse_graph(self, label, as_json=True):
        """Traverse the :attr:`graph` starting at vertex denoted by
        *label*.

        Returns:
            the graph structure as a dictionary optionally converted
            to JSON if *as_json* is set

        """
        log.debug('Traversing label "%s"', label)

        result = None
        try:
            result = self.graph.traverse(label, direction='any', max_depth=1)
        except arango.exceptions.GraphTraverseError as err:
            log.error('Label "%s" traverse error: %s', label, err)

        if result is not None and as_json:
            result = json.dumps(result)

        return result
Example #16
    def persist(self,
                max_read_count=None,
                topic='alexa-flattened',
                group_id='default',
                dry=False):
        """Persist flattened (processed) Alexa domain data to ArangoDB
        executor.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is `alexa-flattened`.

        The *dry* flag will simulate execution.  No records will be
        published.

        Returns:
            total count of records written to the DB across all workers

        """
        count_q = multiprocessing.Queue()

        target = self.persist_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_read_count = 0
        total_put_count = 0
        while not count_q.empty():
            counter = count_q.get()
            total_read_count += counter[0]
            total_put_count += counter[1]

        log.debug('UrlInfo persist worker read|put count %d|%d',
                  total_read_count, total_put_count)
        read_put_counts = (total_read_count, total_put_count)

        return read_put_counts
Example #17
    def topic_dump(self,
                   max_read_count=None,
                   topic='wide-column-csv',
                   group_id='default',
                   dry=False):
        """Simple dump of messages from *topic*.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default Kafka *group_id* name used is `default`.  However,
        we can force a re-read of the topic's messages by overriding
        *group_id* with a unique value.

        The *dry* flag will simulate execution.  No output CSV will be
        created.

        Returns:
            number of messages read

        """
        log.debug('Topic "%s" dump set to read %s messages',
                  topic, max_read_count or 'all')
        log.debug('Topic dump timeout set to %d', self.timeout)

        with self.consumer(topic, group_id) as consumer:
            messages_read = 0
            for message in consumer:
                messages_read += 1
                sys.stdout.buffer.write(message.value)
                print()

                if (max_read_count is not None and
                        messages_read >= max_read_count):
                    log.info('Maximum read threshold %d breached - exiting',
                             max_read_count)
                    break

        return messages_read
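The sys.stdout.buffer.write() call is deliberate: Kafka message values are bytes, and the buffer attribute is stdout's binary layer on Python 3, so the payload is emitted untouched while a plain print() supplies the trailing newline. A tiny self-contained illustration:

    # Write raw bytes to stdout's binary layer, then a text-mode newline.
    import sys

    message_value = '{"domain": "example.com"}'.encode('utf-8')
    sys.stdout.buffer.write(message_value)
    print()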
Example #18
    def wide_column_dump_worker(self, queue, max_read_count, topic, group_id,
                                dry):
        """Wide-column CSV dump worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`wide_column_dump`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('Wide-column CSV dump worker set to read %s messages',
                  max_read_count or 'all')

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                total_messages_read = 0
                total_messages_put = 0
                for message in consumer:
                    traversal = json.loads(message.value.decode('utf-8'))
                    reporter = domain_intel.Reporter(data=traversal)
                    total_messages_read += 1
                    for line in reporter.dump_wide_column_csv():
                        if not dry:
                            producer.send('wide-column-csv',
                                          line.encode('utf-8'))
                        total_messages_put += 1

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        break

        queue.put((total_messages_read, total_messages_put))
Example #19
    def persist(self,
                max_read_count=None,
                topic='analyst-qas',
                group_id='default',
                dry=False):
        """Takes Analyst QA records and writes to the persistent store.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is ``analyst-qas``.

        The *dry* flag will simulate execution.  No records will be
        published.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of edges successfully
            inserted into the persistent store

        """
        log.debug('Analyst QAs persist worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('Analyst QAs persist worker timeout set to %d', self.timeout)
        log.debug('Analyst QAs persist group_id %s', group_id)

        total_messages_read = 0
        edge_count = 0

        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                data = json.loads(message.value.decode('utf-8'))
                for domain, value in data.items():
                    kwargs = {'_key': domain, 'data': value}
                    self.store.collection_insert('analyst-qas', kwargs, dry)

                    edge_kwargs = {
                        '_key': domain,
                        '_from': 'domain/{}'.format(domain),
                        '_to': 'analyst-qas/{}'.format(domain),
                    }
                    if self.store.edge_insert('marked', edge_kwargs, dry):
                        edge_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Max read threshold %d breached: exiting',
                             max_read_count)
                    break

        log.info('Analyst QAs read|edge put count %d|%d', total_messages_read,
                 edge_count)

        return (total_messages_read, edge_count)
Example #20
File: stages.py Project: loum/domain-intel
    def publish(self, payloads):
        """publish arbitrary data into producer. use case would be if
        this is the first stage in a pipeline and doesnt read from anywhere"""

        if not self.is_producer:
            raise GeoDNSError("cannot publish without > 0 topics")

        self._init_kafka()
        metrics = self.metrics

        for i, payload in enumerate(payloads):
            log.debug("publishing %s", payload)
            for dest_topic in self.kafka_producer_topics:
                if not self.dry:
                    self.kafka_producer.send(dest_topic, value=payload)
                else:
                    log.debug("%s: %s", dest_topic, payload)
                    if self.dump:
                        self._do_dump(payload, str(i), DUMP_PUBLISH)

                metrics[dest_topic] += 1
        self.kafka_producer.flush()

        return metrics
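In dry mode nothing is sent, but the per-topic counters are still incremented. A self-contained sketch of that accounting, with collections.Counter standing in for the stage's metrics attribute and an illustrative topic name:

    # Dry-run publish accounting: count per destination topic without sending.
    from collections import Counter

    payloads = [b'{"a": 1}', b'{"b": 2}']
    producer_topics = ['example-topic']      # illustrative topic name
    dry = True
    metrics = Counter()

    for payload in payloads:
        for dest_topic in producer_topics:
            if not dry:
                pass                         # kafka_producer.send(dest_topic, value=payload)
            metrics[dest_topic] += 1

    print(dict(metrics))                     # {'example-topic': 2}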
Example #21
    def persist_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry):
        """Write out the SitesLinkingIn information to a persistent store.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('SitesLinkingIn persist worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('Persist worker timeout set to %d', self.timeout)
        log.debug('Persist group_id %s', group_id)

        messages_read = edge_count = 0

        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                data = message.value.decode('utf-8')

                messages_read += 1
                edge_count += self.extract_siteslinkingin(data, dry=dry)

                if (max_read_count is not None and
                        messages_read >= max_read_count):
                    log.info('Max read threshold %d breached - exiting',
                             max_read_count)
                    break

            log.debug('SitesLinkingIn persist worker messages read %d',
                      messages_read)

        queue.put((messages_read, edge_count))
Example #22
    def persist_worker(self, queue, max_read_count, topic, group_id, dry):
        """TrafficHistory persistent store worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('TrafficHistory persist worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('TrafficHistory persist worker timeout set to %d',
                  self.timeout)
        log.debug('TrafficHistory persist group_id %s', group_id)

        total_messages_read = 0
        edge_count = 0

        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                data = json.loads(message.value.decode('utf-8'))
                parser = domain_intel.parser.TrafficHistory(data)
                self.store.collection_insert('traffic',
                                             parser.db_traffichistory_raw(),
                                             dry)

                if self.store.edge_insert('visit', parser.db_visit_edge(),
                                          dry):
                    edge_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Max read threshold %d breached - exiting',
                             max_read_count)
                    break

            log.info('TrafficHistory persist worker messages read %d',
                     total_messages_read)

        queue.put((total_messages_read, edge_count))
Example #23
    def flatten_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Read all Alexa results from the Kafka partitions.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`flatten_domains`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('UrlInfo flatten worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('UrlInfo flatten worker timeout set to %d', self.timeout)

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                records_read = 0
                for message in consumer:
                    records_read += 1

                    for domain in UrlInfo.flatten_batched_xml(message.value):
                        if not dry:
                            producer.send('alexa-flattened',
                                          domain.encode('utf-8'))

                    if (max_read_count is not None
                            and (records_read >= max_read_count)):
                        break

        log.debug('UrlInfo flatten worker records read %d', records_read)

        queue.put(records_read)
Example #24
    def alexa_csv_dump(self,
                       max_read_count=None,
                       topic='alexa-flattened',
                       group_id='custom',
                       dry=False):
        """Simple CSV dump of targetted Alexa data.

        This method skips the read from the persistent store and
        simply reads from the flattened Alexa Kafka topic.  These messages
        present as JSON.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The *dry* flag will simulate execution.  No output CSV will be
        created.

        Returns:
            number of messages read

        """
        log.debug('Alexa dump worker set to read %s messages', max_read_count
                  or 'all')
        log.debug('Alexa dump worker timeout set to %d', self.timeout)

        # Make sure the CSV files are opened as unicode text on both Python 2 and 3.
        if sys.version_info.major >= 3:
            rank_csv = tempfile.NamedTemporaryFile(mode='w', delete=dry)
            country_rank_csv = tempfile.NamedTemporaryFile(mode='w',
                                                           delete=dry)
        else:
            rank_csv = tempfile.NamedTemporaryFile(delete=dry)
            country_rank_csv = tempfile.NamedTemporaryFile(delete=dry)

        rank_writer = csv.writer(rank_csv)
        country_rank_writer = csv.writer(country_rank_csv)

        with self.consumer(topic, group_id) as consumer:
            messages_read = 0
            for message in consumer:
                messages_read += 1
                flattened_alexa = message.value.decode('utf-8')
                stats = UrlInfo.alexa_flattened_extract(flattened_alexa)
                rank_writer.writerow(stats[0])
                if stats[1]:
                    country_rank_writer.writerows(stats[1])

                if messages_read % 10000 == 0:
                    log.info('Exported %d domains to CSV', messages_read)

                if (max_read_count is not None
                        and messages_read >= max_read_count):
                    log.info('Maximum read threshold %d breached - exiting',
                             max_read_count)
                    break

        log.info('Global rank file %s', rank_csv.name)
        log.info('Country rank file %s', country_rank_csv.name)
        log.info('Alexa dump worker domains read %d', messages_read)

        rank_csv.close()
        country_rank_csv.close()

        return messages_read
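On Python 3 the temporary CSV files are opened in text mode and handed to csv.writer; with dry set to False the delete flag is False, so the files survive close() and the logged paths remain usable. A self-contained sketch of that handling (the row written is illustrative):

    # Text-mode NamedTemporaryFile plus csv.writer; the file is kept on disk
    # because delete=False (i.e. not a dry run).
    import csv
    import tempfile

    dry = False
    rank_csv = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=dry)
    writer = csv.writer(rank_csv)
    writer.writerow(['example.com', 1234, 'AU'])    # illustrative row
    rank_csv.flush()
    print('Global rank file %s' % rank_csv.name)
    rank_csv.close()                                # file persists on disk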
Example #25
File: stages.py Project: loum/domain-intel
    def _do_dump(self, payload, offset, subdir):
        log.debug("DUMPING TO %s/%s/%s with value: %s", self.dump, subdir,
                  offset, payload)
        with open("%s/%s/%s" % (self.dump, subdir, offset), "wb") as _fh:
            _fh.write(payload)
Example #26
File: stages.py Project: loum/domain-intel
    def run(self):
        self._init_kafka()

        # Preflight checks: run() presumes an input and an output side,
        # so we must validate that we have what we need.
        # This is not done in the constructor to support special-case stages,
        # i.e. the root and final leaf nodes.
        if self.kafka_consumer_group_id is None:
            raise GeoDNSError(
                "will not accept null kafka_consumer_group_id. set one if you are consuming"
            )

        if self.worker is None:
            raise GeoDNSError("need a worker!")

        if not (self.is_producer and self.is_consumer):
            raise GeoDNSError(
                "cannot call run() without input and output topics")

        self.kafka_consumer.subscribe(self.kafka_consumer_topics)

        metrics = self.metrics
        for msg in self.kafka_consumer:
            metrics["messages_received"] += 1

            if self.dump:
                self._do_dump(msg.value, str(metrics["messages_received"]),
                              DUMP_CONSUME)

            last_exc = None
            for retry in range(0, self.retryable_exceptions_count):
                try:

                    # enforce process level timeout with signals
                    old_alarm_handler = signal.signal(
                        signal.SIGALRM, GeoDNSStage._timeout_handler)
                    signal.alarm(self.worker_timeout_seconds)

                    res = self.worker(msg.value)

                    signal.alarm(0)
                    signal.signal(signal.SIGALRM, old_alarm_handler)

                    last_exc = None
                    break
                except self.retryable_exceptions + (WorkerTimedOut, ) as exc:
                    log.error("caught retryable exceptions: %s", str(exc))
                    metrics["retryable_exceptions"] += 1
                    last_exc = exc
                    time.sleep(retry)

            if last_exc is not None:
                log.error("exceeded retryable exception count of %d",
                          self.retryable_exceptions_count)
                raise last_exc

            # try marshalling response
            metrics["messages_processed"] += 1
            if hasattr(res, "marshal"):
                res = res.marshal()
                metrics["responses_marshalled"] += 1

            for dest_topic in self.kafka_producer_topics:
                metrics["messages_sent"] += 1

                if not self.dry:
                    self.kafka_producer.send(dest_topic, value=res)
                else:
                    log.debug("%s: %s", dest_topic, res)
                    if self.dump:
                        self._do_dump(
                            res, "%d.%d" % (metrics["messages_received"],
                                            metrics["messages_sent"]),
                            DUMP_PUBLISH)

            self.kafka_producer.flush()
            self.kafka_consumer.commit()

            log.debug(metrics)

            if self.max_read_count is not None and metrics[
                    "messages_received"] >= self.max_read_count:
                break

        return metrics
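The retry loop wraps each worker call in a SIGALRM-based process-level timeout: install a handler, arm the alarm, call the worker, then disarm and restore the old handler. A self-contained sketch of that guard (POSIX only; the worker and names here are illustrative):

    # SIGALRM-based per-call timeout, as used around self.worker(msg.value).
    import signal
    import time


    class WorkerTimedOut(Exception):
        """Raised when the worker exceeds its allotted time."""


    def _timeout_handler(signum, frame):
        raise WorkerTimedOut('worker timed out')


    def guarded_call(worker, payload, timeout_seconds):
        old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        signal.alarm(timeout_seconds)
        try:
            return worker(payload)
        finally:
            signal.alarm(0)                     # always disarm the alarm
            signal.signal(signal.SIGALRM, old_handler)


    def slow_worker(payload):
        time.sleep(2)
        return payload.upper()


    try:
        guarded_call(slow_worker, 'msg', timeout_seconds=1)
    except WorkerTimedOut as exc:
        print(exc)                              # worker timed out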
Example #27
File: stages.py Project: loum/domain-intel
    def persist_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Persist flattened (processed) GeoDNS domain data to ArangoDB
        worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`persist`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed
        """
        log.debug('Data persist worker set to read %s messages', max_read_count
                  or 'all')
        timeout = domain_intel.common.CONFIG.get('timeout', 10000)
        log.debug('Persist worker timeout set to %d', timeout)

        store = domain_intel.Store()

        kafka_config = domain_intel.common.CONFIG.get('kafka', {})
        kwargs = {
            'bootstrap_servers': kafka_config.get('bootstrap_servers'),
            'group_id': group_id,
            'consumer_timeout_ms': timeout,
        }
        with domain_intel.utils.safe_consumer(topic, **kwargs) as consumer:
            messages_read = 0
            for message in consumer:
                messages_read += 1

                dns_data = message.value.decode('utf-8')
                parser = domain_intel.parser.GeoDNS(dns_data)
                store.collection_insert('geodns', parser.db_geodns_raw(), dry)

                for ipv4 in parser.db_ipv4_vertex:
                    store.collection_insert('ipv4', ipv4, dry)

                for ipv6 in parser.db_ipv6_vertex:
                    store.collection_insert('ipv6', ipv6, dry)

                for ipv4_edge in parser.db_ipv4_edge:
                    store.edge_insert('ipv4_resolves', ipv4_edge, dry)

                for ipv6_edge in parser.db_ipv6_edge:
                    store.edge_insert('ipv6_resolves', ipv6_edge, dry)

                if (max_read_count is not None
                        and messages_read >= max_read_count):
                    log.info('Maximum read threshold %d breached - exiting',
                             max_read_count)
                    break

        log.debug('Data persist worker domains read %d', messages_read)

        queue.put(messages_read)