Example #1
 def output(self, crawl, exact_count=True, min_surt_hll_size=50000):
     counts = (self.pages, self.unique_urls())
     host_domain_count = HostDomainCount()
     surt_hll = None
     if self.unique_urls() >= min_surt_hll_size:
         surt_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     for url, count in self.url.items():
         host_domain_count.add(url, count)
         if exact_count:
             yield (CST.url.value, self.surt_domain, url), (crawl, count)
         if surt_hll is not None:
             surt_hll.add(url)
     if exact_count:
         for digest, counts in self.digest.items():
             yield (CST.digest.value, digest), (crawl, counts)
     for mime, counts in self.mime.items():
         yield (CST.mimetype.value, mime, crawl), counts
     for mime, counts in self.mime_detected.items():
         yield (CST.mimetype_detected.value, mime, crawl), counts
     for key, val in host_domain_count.output(crawl):
         yield key, val
     yield ((CST.surt_domain.value, self.surt_domain, crawl),
            (self.pages, self.unique_urls(), len(host_domain_count.hosts)))
     if surt_hll is not None:
         yield ((CST.size_estimate_for.value, CST.surt_domain.value,
                 self.surt_domain, CST.url.value, crawl),
                (self.unique_urls(),
                 CrawlStatsJSONEncoder.json_encode_hyperloglog(surt_hll)))
     for status, counts in self.http_status.items():
         yield (CST.http_status.value, status, crawl), counts
     for url, count in self.robotstxt_url.items():
         yield (CST.size_robotstxt.value, CST.url.value, crawl), 1
         yield (CST.size_robotstxt.value, CST.page.value, crawl), count
     for status, counts in self.robotstxt_status.items():
         yield (CST.robotstxt_status.value, status, crawl), counts
 def json_decode_hyperloglog(dic):
     hll = HyperLogLog(HYPERLOGLOG_ERROR)
     hll.p = dic['p']
     hll.m = dic['m']
     hll.alpha = dic['alpha']
     hll.M = dic['M']
     return hll
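A side note on the encoder counterpart: the decoder above restores only the fields p, m, alpha and M, so CrawlStatsJSONEncoder.json_encode_hyperloglog presumably serializes just those registers. The project's real encoder/decoder classes are not shown here; the following is a minimal sketch of how such a pair is typically wired into the standard json module, with the field names taken from the decoder and everything else assumed:

import json
from hyperloglog import HyperLogLog

HYPERLOGLOG_ERROR = .01  # error rate assumed for this sketch

class CrawlStatsJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, HyperLogLog):
            # serialize exactly the fields json_decode_hyperloglog reads back
            return {'__type__': 'HyperLogLog', 'p': obj.p, 'm': obj.m,
                    'alpha': obj.alpha, 'M': obj.M}
        return super().default(obj)

class CrawlStatsJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, **kwargs):
        super().__init__(object_hook=self.decode_dict, *args, **kwargs)

    @staticmethod
    def decode_dict(dic):
        if dic.get('__type__') == 'HyperLogLog':
            hll = HyperLogLog(HYPERLOGLOG_ERROR)
            hll.p, hll.m, hll.alpha, hll.M = dic['p'], dic['m'], dic['alpha'], dic['M']
            return hll
        return dic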
Example #3
 def count_mapper_init(self):
     """Because cdx.gz files cannot be split and
     mapreduce.input.fileinputformat.split.minsize is set to a value larger
     than any cdx.gz file, the mapper is guaranteed to process the content
     of a single cdx file. Input lines of a cdx file are sorted by SURT URL
     which makes it possible to aggregate URL counts for one SURT domain
     in memory. A single SURT domain may nevertheless span multiple cdx
     files. In this case (and without --exact-counts) the count of unique
     URLs and the URL histograms may be slightly off if the same URL also
     occurs in a second cdx file. However, this problem is negligible because
     there are only 300 cdx files."""
     self.counters = Counter()
     self.cdx_path = os.environ['mapreduce_map_input_file']
     logging.info('Reading {0}'.format(self.cdx_path))
     self.crawl_name = 'unknown'
     self.crawl = None
     crawl_name_match = self.crawlpattern.search(self.cdx_path)
     if crawl_name_match is not None:
         self.crawl_name = crawl_name_match.group(1)
         self.crawl = MonthlyCrawl.get_by_name(self.crawl_name)
     else:
         raise InputError(
             "Cannot determine ID of monthly crawl from input path {}".
             format(self.cdx_path))
     self.fetches_total = 0
     self.pages_total = 0
     self.urls_total = 0
     self.urls_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     self.digest_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     self.url_histogram = Counter()
     self.count = None
     # first and last SURT may continue in previous/next cdx
     self.min_surt_hll_size = 1
     self.increment_counter('cdx-stats', 'cdx files processed', 1)
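The crawl name above is pulled out of the input path with self.crawlpattern, whose definition appears further down in the CCStatsJob class. A quick standalone check of that regex against an illustrative cdx path (the path itself is made up here):

import re

crawlpattern = re.compile(r'(CC-MAIN-2\d{3}-\d{2})')

# illustrative input path; only the CC-MAIN-YYYY-WW part matters for the match
cdx_path = 's3://commoncrawl/cc-index/collections/CC-MAIN-2023-06/indexes/cdx-00123.gz'
match = crawlpattern.search(cdx_path)
print(match.group(1) if match else 'no crawl ID found')  # CC-MAIN-2023-06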
Example #4
def createcounter(nodes):
    counter = {}
    for node in nodes:
        h = HyperLogLog()
        h.update(str(node).encode('utf8'))
        counter[node] = h.reg
    return counter
def test_json_hyperloglog():
    hll1 = HyperLogLog(.01)
    for i in range(0, 50):
        hll1.add(i)
    jsons = json.dumps(hll1, cls=CrawlStatsJSONEncoder)
    hll2 = json.loads(jsons, cls=CrawlStatsJSONDecoder)
    assert(hll1.card() == hll2.card())
    # test jsonpickle serialization
    jsonp = jsonpickle.encode(hll2)
    hll3 = jsonpickle.decode(jsonp)
    assert(hll1.card() == hll3.card())
Example #6
def hyperloglog():
    h = HyperLogLog()
    data = []
    for pair in stream:
        for node in pair:
            data.append(node)
    for item in data:
        h.update(str(item).encode('utf8'))
    print "the number of node"
    num = count(h)
    print num
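The function above depends on a module-level stream and a count() helper that are not shown. A self-contained variant of the same idea, assuming the datasketch flavour of HyperLogLog (which the update(bytes) call suggests), where the estimate comes from h.count():

from datasketch import HyperLogLog

# stand-in for the missing `stream`: a few edges given as (node, node) pairs
stream = [(1, 2), (2, 3), (3, 1), (4, 2)]

h = HyperLogLog()
for pair in stream:
    for node in pair:
        h.update(str(node).encode('utf8'))

print("number of distinct nodes (estimate):", h.count())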
Example #7
 def update(self, group_key, bindings):
     """Update the aggregator with a new value for a group of bindings"""
     if self._variable in bindings:
         if group_key not in self._groups:
             self._groups[group_key] = HyperLogLog(self._error_rate)
             self._size += self._groups[group_key].size()
         self._groups[group_key].add(bindings[self._variable])
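update() above keeps one HyperLogLog per group key and feeds it the bound value of a single variable; the enclosing aggregator class is not shown. A small standalone illustration of that per-group distinct counting, using the hyperloglog package and made-up bindings:

from collections import defaultdict
from hyperloglog import HyperLogLog

ERROR_RATE = 0.01
groups = defaultdict(lambda: HyperLogLog(ERROR_RATE))

# hypothetical solution bindings, grouped by the value of ?country
solutions = [
    {'?country': 'fr', '?person': 'alice'},
    {'?country': 'fr', '?person': 'bob'},
    {'?country': 'fr', '?person': 'alice'},  # duplicate, not counted twice
    {'?country': 'de', '?person': 'carol'},
]

for bindings in solutions:
    if '?person' in bindings:
        groups[bindings['?country']].add(bindings['?person'])

for group_key, hll in groups.items():
    print(group_key, 'has ~', len(hll), 'distinct persons')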
Example #8
def update_uids(new_uids, hll):
    hll = hll or HyperLogLog(0.01)

    for uid in new_uids:
        hll.add(uid)

    return hll
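A short usage sketch for update_uids, assuming the hyperloglog package used elsewhere in these examples (len() yields the cardinality estimate):

hll = update_uids(['user-1', 'user-2', 'user-3'], None)  # creates a fresh HLL
hll = update_uids(['user-2', 'user-4'], hll)             # keeps accumulating
print(len(hll))  # roughly 4 distinct uids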
Example #9
File: hll.py  Project: pombredanne/sifr
 def add(self, key, identifier):
     """
     Adds a key to the counter
     :param key:
     :param identifier: any object that can be represented
      as a string
     """
     self.counter.setdefault(key, HyperLogLog(0.005))
     self.counter[key].add(str(identifier))
 def output(self, crawl, exact_count=True, min_surt_hll_size=50000):
     counts = (self.pages, self.unique_urls())
     host_domain_count = HostDomainCount()
     surt_hll = None
     if self.unique_urls() >= min_surt_hll_size:
         surt_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     for url, count in self.url.items():
         host_domain_count.add(url, count)
         if exact_count:
             yield (CST.url.value, self.surt_domain, url), (crawl, count)
         if surt_hll is not None:
             surt_hll.add(url)
     if exact_count:
         for digest, counts in self.digest.items():
             yield (CST.digest.value, digest), (crawl, counts)
     for mime, counts in self.mime.items():
         yield (CST.mimetype.value, mime, crawl), counts
     for mime, counts in self.mime_detected.items():
         yield (CST.mimetype_detected.value, mime, crawl), counts
     for charset, counts in self.charset.items():
         yield (CST.charset.value, charset, crawl), counts
     for languages, counts in self.languages.items():
         yield (CST.languages.value, languages, crawl), counts
         # yield primary language
         prim_l = languages.split(',')[0]
         yield (CST.primary_language.value, prim_l, crawl), counts
     for key, val in host_domain_count.output(crawl):
         yield key, val
     yield((CST.surt_domain.value, self.surt_domain, crawl),
           (self.pages, self.unique_urls(), len(host_domain_count.hosts)))
     if surt_hll is not None:
         yield((CST.size_estimate_for.value, CST.surt_domain.value,
                self.surt_domain, CST.url.value, crawl),
               (self.unique_urls(),
                CrawlStatsJSONEncoder.json_encode_hyperloglog(surt_hll)))
     for status, counts in self.http_status.items():
         yield (CST.http_status.value, status, crawl), counts
     for url, count in self.robotstxt_url.items():
         yield (CST.size_robotstxt.value, CST.url.value, crawl), 1
         yield (CST.size_robotstxt.value, CST.page.value, crawl), count
     for status, counts in self.robotstxt_status.items():
         yield (CST.robotstxt_status.value, status, crawl), counts
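Each languages key is a comma-separated combination of detected languages, and the primary language is simply its first element, e.g.:

languages = 'zho,eng'             # illustrative value of a languages key
prim_l = languages.split(',')[0]
print(prim_l)                     # zho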
Example #11
    def get_unique_states(self, states=None, limits=None):
        if states is None:
            states = copy.deepcopy(self.states)

        for axis in range(len(states[0])):
            if limits is None:
                axmin, axmax = np.min(states[:, axis]), np.max(states[:, axis])
            else:
                axmin, axmax = limits[axis*2:axis*2+2]

            states[:, axis] = np.digitize(states[:, axis],
                                          np.linspace(axmin, axmax, num=100))
        states = states.astype(int)

        hll = HyperLogLog(0.01)
        for state in tqdm(states,
                          desc=f"Search for Unique States in whole dataset ({self.environment} @ {self.buffer_type})",
                          total=len(states)):
            hll.add(",".join([str(s) for s in state]))

        return len(hll)
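get_unique_states() buckets every state dimension into 100 bins and counts the distinct bucketed states with an HLL. A condensed standalone version of that idea on random data, assuming numpy and the hyperloglog package:

import numpy as np
from hyperloglog import HyperLogLog

states = np.random.uniform(-1.0, 1.0, size=(10000, 3))  # toy data

# discretize every axis into 100 bins, as in get_unique_states
binned = np.empty_like(states)
for axis in range(states.shape[1]):
    axmin, axmax = states[:, axis].min(), states[:, axis].max()
    binned[:, axis] = np.digitize(states[:, axis],
                                  np.linspace(axmin, axmax, num=100))

hll = HyperLogLog(0.01)
for state in binned.astype(int):
    hll.add(",".join(str(s) for s in state))

print(len(hll))  # approximate number of distinct discretized states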
Example #12
 def json_decode_hyperloglog(dic):
     hll = HyperLogLog(HYPERLOGLOG_ERROR)
     hll.p = dic['p']
     hll.m = dic['m']
     hll.alpha = dic['alpha']
     hll.M = dic['M']
     return hll
 def count_mapper_init(self):
     """Because cdx.gz files cannot be split and
     mapreduce.input.fileinputformat.split.minsize is set to a value larger
     than any cdx.gz file, the mapper is guaranteed to process the content
     of a single cdx file. Input lines of a cdx file are sorted by SURT URL
     which makes it possible to aggregate URL counts for one SURT domain
     in memory. A single SURT domain may nevertheless span multiple cdx
     files. In this case (and without --exact-counts) the count of unique
     URLs and the URL histograms may be slightly off if the same URL also
     occurs in a second cdx file. However, this problem is negligible because
     there are only 300 cdx files."""
     self.counters = Counter()
     self.cdx_path = os.environ['mapreduce_map_input_file']
     LOG.info('Reading {0}'.format(self.cdx_path))
     self.crawl_name = None
     self.crawl = None
     if self.options.crawl is not None:
         self.crawl_name = self.options.crawl
     else:
         crawl_name_match = self.crawlpattern.search(self.cdx_path)
         if crawl_name_match is not None:
             self.crawl_name = crawl_name_match.group(1)
         else:
             raise InputError(
                 "Cannot determine ID of monthly crawl from input path {}"
                 .format(self.cdx_path))
     if self.crawl_name is None:
         raise InputError("Name of crawl not given")
     self.crawl = MonthlyCrawl.get_by_name(self.crawl_name)
     self.fetches_total = 0
     self.pages_total = 0
     self.urls_total = 0
     self.urls_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     self.digest_hll = HyperLogLog(HYPERLOGLOG_ERROR)
     self.url_histogram = Counter()
     self.count = None
     # first and last SURT may continue in previous/next cdx
     self.min_surt_hll_size = 1
     self.increment_counter('cdx-stats', 'cdx files processed', 1)
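MonthlyCrawl.get_by_name (and to_name, used later in the stats reducer) is not shown; it evidently maps crawl names to compact integer ids and back. A purely hypothetical stand-in, only to make the calls above concrete:

class MonthlyCrawl:
    # hypothetical table; the real module ships a fixed registry of crawl ids
    by_name = {'CC-MAIN-2023-06': 87, 'CC-MAIN-2023-14': 88}
    by_id = {v: k for k, v in by_name.items()}

    @staticmethod
    def get_by_name(name):
        return MonthlyCrawl.by_name[name]

    @staticmethod
    def to_name(crawl_id):
        return MonthlyCrawl.by_id[crawl_id]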
Example #14
def test_json_hyperloglog():
    hll1 = HyperLogLog(.01)
    for i in range(0, 50):
        hll1.add(i)
    jsons = json.dumps(hll1, cls=CrawlStatsJSONEncoder)
    hll2 = json.loads(jsons, cls=CrawlStatsJSONDecoder)
    assert (hll1.card() == hll2.card())
    # test jsonpickle serialization
    jsonp = jsonpickle.encode(hll2)
    hll3 = jsonpickle.decode(jsonp)
    assert (hll1.card() == hll3.card())
Example #15
 def cumulative_size(self):
     total_pages = 0
     for crawl in sorted(self.crawls):
         total_pages += self.size['page'][self.crawls[crawl]]
         self.add_by_type(crawl, 'page cumul.', total_pages)
     for item_type in self.hll.keys():
         item_type_cumul = ' '.join([item_type, 'cumul.'])
         item_type_new = ' '.join([item_type, 'new'])
         cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
         n = 0
         hlls = []
         for crawl in sorted(self.hll[item_type]):
             n += 1
             hll = self.hll[item_type][crawl]
             last_cumul_hll_len = len(cumul_hll)
             cumul_hll.update(hll)
             # cumulative size
             self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
             # new unseen items this crawl (since the first analyzed crawl)
             unseen = (len(cumul_hll) - last_cumul_hll_len)
             if unseen > len(hll):
                 # 1% error rate for cumulative HLLs is large in comparison
                 # to crawl size, adjust to size of items in this crawl
                 # (there can be no more new items than the size of the crawl)
                 unseen = len(hll)
             self.add_by_type(crawl, item_type_new, unseen)
             hlls.append(hll)
             # cumulative size for last N crawls
             for n_crawls in [2, 3, 4, 6, 9, 12]:
                 item_type_n_crawls = '{} cumul. last {} crawls'.format(
                     item_type, n_crawls)
                 if n_crawls <= len(hlls):
                     cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                     for i in range(1, (n_crawls + 1)):
                         if i > len(hlls):
                             break
                         cum_hll.update(hlls[-i])
                     size_last_n = len(cum_hll)
                 else:
                     size_last_n = 'nan'
                 self.add_by_type(crawl, item_type_n_crawls, size_last_n)
    def execute(self, set_size, m, p):
        print "t", set_size, m, p

        hll = HyperLogLog(m)
        for i in range(set_size):
            hll.offer(str(i))

        estimate = hll.count()
        error = abs(estimate / float(set_size) - 1)

        strdata = hll.datastr()
        print "e", estimate, error, 1 << m, len(strdata)
        self.assertLess(len(hll.datastr()), 1 << m)
        self.assertLess(error, p)
 def cumulative_size(self):
     total_pages = 0
     for crawl in sorted(self.crawls):
         total_pages += self.size['page'][self.crawls[crawl]]
         self.add_by_type(crawl, 'page cumul.', total_pages)
     for item_type in self.hll.keys():
         item_type_cumul = ' '.join([item_type, 'cumul.'])
         item_type_new = ' '.join([item_type, 'new'])
         cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
         n = 0
         hlls = []
         for crawl in sorted(self.hll[item_type]):
             n += 1
             hll = self.hll[item_type][crawl]
             last_cumul_hll_len = len(cumul_hll)
             cumul_hll.update(hll)
             # cumulative size
             self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
             # new unseen items this crawl (since the first analyzed crawl)
             unseen = (len(cumul_hll) - last_cumul_hll_len)
             if unseen > len(hll):
                 # 1% error rate for cumulative HLLs is large in comparison
                 # to crawl size, adjust to size of items in this crawl
                 # (there can be no more new items than the size of the crawl)
                 unseen = len(hll)
             self.add_by_type(crawl, item_type_new, unseen)
             hlls.append(hll)
             # cumulative size for last N crawls
             for n_crawls in [2, 3, 6, 12]:
                 item_type_n_crawls = '{} cumul. last {} crawls'.format(
                     item_type, n_crawls)
                 if n_crawls <= len(hlls):
                     cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                     for i in range(1, (n_crawls+1)):
                         if i > len(hlls):
                             break
                         cum_hll.update(hlls[-i])
                     size_last_n = len(cum_hll)
                 else:
                     size_last_n = 'nan'
                 self.add_by_type(crawl, item_type_n_crawls, size_last_n)
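The core of cumulative_size is merging per-crawl HLLs with update() and reading the growth of len() as the number of newly seen items, capped by the size of the current crawl. A small illustration of that merge-and-diff step with the hyperloglog package:

from hyperloglog import HyperLogLog

HYPERLOGLOG_ERROR = .01  # assumed, as elsewhere in these examples

crawl_a = HyperLogLog(HYPERLOGLOG_ERROR)
crawl_b = HyperLogLog(HYPERLOGLOG_ERROR)
for url in ('u1', 'u2', 'u3'):
    crawl_a.add(url)
for url in ('u2', 'u3', 'u4', 'u5'):
    crawl_b.add(url)

cumul = HyperLogLog(HYPERLOGLOG_ERROR)
for crawl_hll in (crawl_a, crawl_b):
    before = len(cumul)
    cumul.update(crawl_hll)               # merge this crawl into the cumulative HLL
    unseen = len(cumul) - before          # new items since the first crawl
    unseen = min(unseen, len(crawl_hll))  # cap by crawl size, as in cumulative_size
    print(len(cumul), unseen)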
Example #18
 def count_reducer(self, key, values):
     outputType = key[0]
     if outputType in (CST.size.value, CST.size_robotstxt.value):
         yield key, sum(values)
     elif outputType == CST.histogram.value:
         yield key, sum(values)
     elif outputType in (CST.url.value, CST.digest.value):
         # only with --exact-counts
         crawls = MonthlyCrawlSet()
         new_crawls = set()
         page_count = MultiCount(2)
         for val in values:
             if type(val) is list:
                 if (outputType == CST.url.value):
                     (crawl, pages) = val
                     page_count.incr(crawl, pages, 1)
                 else:  # digest
                     (crawl, (pages, urls)) = val
                     page_count.incr(crawl, pages, urls)
                 crawls.add(crawl)
                 new_crawls.add(crawl)
             else:
                 # crawl set bit mask
                 crawls.update(val)
         yield key, crawls.get_bits()
         for new_crawl in new_crawls:
             if crawls.is_new(new_crawl):
                 self.counters[(CST.new_items.value, outputType,
                                new_crawl)] += 1
         # url/digest duplicate histograms
         for crawl, counts in page_count.items():
             items = (1 + counts[0] - counts[1])
             self.counters[(CST.histogram.value, outputType, crawl,
                            CST.page.value, items)] += 1
         # size in terms of unique URLs and unique content digests
         for crawl, counts in page_count.items():
             self.counters[(CST.size.value, outputType, crawl)] += 1
     elif outputType in (CST.mimetype.value, CST.mimetype_detected.value,
                         CST.charset.value, CST.languages.value,
                         CST.primary_language.value, CST.scheme.value,
                         CST.tld.value, CST.domain.value,
                         CST.surt_domain.value, CST.host.value,
                         CST.http_status.value, CST.robotstxt_status.value):
         yield key, MultiCount.sum_values(values)
     elif outputType == CST.size_estimate.value:
         hll = HyperLogLog(HYPERLOGLOG_ERROR)
         for val in values:
             hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
         yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
     elif outputType == CST.size_estimate_for.value:
         res = None
         hll = None
         cnt = 0
         for val in values:
             if res:
                 if hll is None:
                     cnt = res[0]
                     hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                         res[1])
                 cnt += val[0]
                 hll.update(
                     CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
             else:
                 res = val
         if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
             yield (key,
                    (cnt,
                     CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
         elif res[0] >= MIN_SURT_HLL_SIZE:
             yield (key, res)
     else:
         raise UnhandledTypeError(outputType)
Example #19
class CCStatsJob(MRJob):
    '''Job to get crawl statistics from Common Crawl index
       --job=count
            run count job (first step) to get counts
            from Common Crawl index files (cdx-*.gz)
       --job=stats
            run statistics job (second step) on output
            from count job'''

    OUTPUT_PROTOCOL = JSONProtocol

    JOBCONF = {
        'mapreduce.task.timeout': '9600000',
        'mapreduce.map.speculative': 'false',
        'mapreduce.reduce.speculative': 'false',
        'mapreduce.job.jvm.numtasks': '-1',
    }

    s3pattern = re.compile('^s3://([^/]+)/(.+)')
    gzpattern = re.compile(r'\.gz$')
    crawlpattern = re.compile(r'(CC-MAIN-2\d{3}-\d{2})')

    def configure_args(self):
        """Custom command line options for common crawl index statistics"""
        super(CCStatsJob, self).configure_args()
        self.add_passthru_arg(
            '--job',
            dest='job_to_run',
            default='',
            choices=['count', 'stats', ''],
            help='''Job(s) to run ("count", "stats", or empty to run both)''')
        self.add_passthru_arg(
            '--exact-counts',
            dest='exact_counts',
            action='store_true',
            default=None,
            help='''Exact counts for URLs and content digests,
                    this increases the output size significantly''')
        self.add_passthru_arg(
            '--no-exact-counts',
            dest='exact_counts',
            action='store_false',
            default=None,
            help='''No exact counts for URLs and content digests
                    to save storage space and computation time''')
        self.add_passthru_arg(
            '--max-top-hosts-domains',
            dest='max_hosts',
            type=int,
            default=200,
            help='''Max. number of most frequent hosts or domains shown
                    in final statistics (cf. --min-urls-top-host-domain)''')
        self.add_passthru_arg(
            '--min-urls-top-host-domain',
            dest='min_domain_frequency',
            type=int,
            default=1,
            help='''Min. number of URLs required per host or domain shown
                    in final statistics (cf. --max-top-hosts-domains).''')
        self.add_passthru_arg(
            '--min-lang-comb-freq',
            dest='min_lang_comb_freq',
            type=int,
            default=1,
            help='''Min. number of pages required for a combination of detected
                    languages to be shown in final statistics.''')
        self.add_passthru_arg(
            '--crawl',
            dest='crawl',
            default=None,
            help='''ID/name of the crawl analyzed (if not given detected
                    from input path)''')

    def input_protocol(self):
        if self.options.job_to_run != 'stats':
            LOG.debug('Reading text input from cdx files')
            return RawValueProtocol()
        LOG.debug('Reading JSON input from count job')
        return JSONProtocol()

    def hadoop_input_format(self):
        input_format = self.HADOOP_INPUT_FORMAT
        if self.options.job_to_run != 'stats':
            input_format = 'org.apache.hadoop.mapred.TextInputFormat'
        LOG.info("Setting input format for {} job: {}".format(
            self.options.job_to_run, input_format))
        return input_format

    def count_mapper_init(self):
        """Because cdx.gz files cannot be split and
        mapreduce.input.fileinputformat.split.minsize is set to a value larger
        than any cdx.gz file, the mapper is guaranteed to process the content
        of a single cdx file. Input lines of a cdx file are sorted by SURT URL
        which makes it possible to aggregate URL counts for one SURT domain
        in memory. A single SURT domain may nevertheless span multiple cdx
        files. In this case (and without --exact-counts) the count of unique
        URLs and the URL histograms may be slightly off if the same URL also
        occurs in a second cdx file. However, this problem is negligible because
        there are only 300 cdx files."""
        self.counters = Counter()
        self.cdx_path = os.environ['mapreduce_map_input_file']
        LOG.info('Reading {0}'.format(self.cdx_path))
        self.crawl_name = None
        self.crawl = None
        if self.options.crawl is not None:
            self.crawl_name = self.options.crawl
        else:
            crawl_name_match = self.crawlpattern.search(self.cdx_path)
            if crawl_name_match is not None:
                self.crawl_name = crawl_name_match.group(1)
            else:
                raise InputError(
                    "Cannot determine ID of monthly crawl from input path {}".
                    format(self.cdx_path))
        if self.crawl_name is None:
            raise InputError("Name of crawl not given")
        self.crawl = MonthlyCrawl.get_by_name(self.crawl_name)
        self.fetches_total = 0
        self.pages_total = 0
        self.urls_total = 0
        self.urls_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.digest_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.url_histogram = Counter()
        self.count = None
        # first and last SURT may continue in previous/next cdx
        self.min_surt_hll_size = 1
        self.increment_counter('cdx-stats', 'cdx files processed', 1)

    def count_mapper(self, _, line):
        self.fetches_total += 1
        if (self.fetches_total % 1000) == 0:
            self.increment_counter('cdx-stats', 'cdx lines read', 1000)
            if (self.fetches_total % 100000) == 0:
                LOG.info('Read {0} cdx lines'.format(self.fetches_total))
            else:
                LOG.debug('Read {0} cdx lines'.format(self.fetches_total))
        parts = line.split(' ')
        [surt_domain, path] = parts[0].split(')', 1)
        if self.count is None:
            self.count = SurtDomainCount(surt_domain)
        if surt_domain != self.count.surt_domain:
            # output accumulated statistics for one SURT domain
            for pair in self.count.output(self.crawl,
                                          self.options.exact_counts,
                                          self.min_surt_hll_size):
                yield pair
            self.urls_total += self.count.unique_urls()
            for url, cnt in self.count.url.items():
                self.urls_hll.add(url)
                self.url_histogram[cnt] += 1
            for digest in self.count.digest:
                self.digest_hll.add(digest)
            self.pages_total += self.count.pages
            self.count = SurtDomainCount(surt_domain)
            self.min_surt_hll_size = MIN_SURT_HLL_SIZE
        json_string = ' '.join(parts[2:])
        try:
            metadata = ujson.loads(json_string)
            self.count.add(path, metadata)
        except ValueError as e:
            LOG.error('Failed to parse json: {0} - {1}'.format(e, json_string))

    def count_mapper_final(self):
        self.increment_counter('cdx-stats', 'cdx lines read',
                               self.fetches_total % 1000)
        if self.count is None:
            return
        for pair in self.count.output(self.crawl, self.options.exact_counts,
                                      1):
            yield pair
        self.urls_total += self.count.unique_urls()
        for url, cnt in self.count.url.items():
            self.urls_hll.add(url)
            self.url_histogram[cnt] += 1
        for digest in self.count.digest:
            self.digest_hll.add(digest)
        self.pages_total += self.count.pages
        if not self.options.exact_counts:
            for count, frequency in self.url_histogram.items():
                yield ((CST.histogram.value, CST.url.value, self.crawl,
                        CST.page.value, count), frequency)
        yield (CST.size.value, CST.page.value, self.crawl), self.pages_total
        yield (CST.size.value, CST.fetch.value, self.crawl), self.fetches_total
        if not self.options.exact_counts:
            yield (CST.size.value, CST.url.value, self.crawl), self.urls_total
        yield ((CST.size_estimate.value, CST.url.value, self.crawl),
               CrawlStatsJSONEncoder.json_encode_hyperloglog(self.urls_hll))
        yield ((CST.size_estimate.value, CST.digest.value, self.crawl),
               CrawlStatsJSONEncoder.json_encode_hyperloglog(self.digest_hll))
        self.increment_counter('cdx-stats', 'cdx files finished', 1)

    def reducer_init(self):
        self.counters = Counter()
        self.mostfrequent = defaultdict(list)

    def count_reducer(self, key, values):
        outputType = key[0]
        if outputType in (CST.size.value, CST.size_robotstxt.value):
            yield key, sum(values)
        elif outputType == CST.histogram.value:
            yield key, sum(values)
        elif outputType in (CST.url.value, CST.digest.value):
            # only with --exact-counts
            crawls = MonthlyCrawlSet()
            new_crawls = set()
            page_count = MultiCount(2)
            for val in values:
                if type(val) is list:
                    if (outputType == CST.url.value):
                        (crawl, pages) = val
                        page_count.incr(crawl, pages, 1)
                    else:  # digest
                        (crawl, (pages, urls)) = val
                        page_count.incr(crawl, pages, urls)
                    crawls.add(crawl)
                    new_crawls.add(crawl)
                else:
                    # crawl set bit mask
                    crawls.update(val)
            yield key, crawls.get_bits()
            for new_crawl in new_crawls:
                if crawls.is_new(new_crawl):
                    self.counters[(CST.new_items.value, outputType,
                                   new_crawl)] += 1
            # url/digest duplicate histograms
            for crawl, counts in page_count.items():
                items = (1 + counts[0] - counts[1])
                self.counters[(CST.histogram.value, outputType, crawl,
                               CST.page.value, items)] += 1
            # size in terms of unique URLs and unique content digests
            for crawl, counts in page_count.items():
                self.counters[(CST.size.value, outputType, crawl)] += 1
        elif outputType in (CST.mimetype.value, CST.mimetype_detected.value,
                            CST.charset.value, CST.languages.value,
                            CST.primary_language.value, CST.scheme.value,
                            CST.tld.value, CST.domain.value,
                            CST.surt_domain.value, CST.host.value,
                            CST.http_status.value, CST.robotstxt_status.value):
            yield key, MultiCount.sum_values(values)
        elif outputType == CST.size_estimate.value:
            hll = HyperLogLog(HYPERLOGLOG_ERROR)
            for val in values:
                hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
            yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
        elif outputType == CST.size_estimate_for.value:
            res = None
            hll = None
            cnt = 0
            for val in values:
                if res:
                    if hll is None:
                        cnt = res[0]
                        hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                            res[1])
                    cnt += val[0]
                    hll.update(
                        CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
                else:
                    res = val
            if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
                yield (key,
                       (cnt,
                        CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
            elif res[0] >= MIN_SURT_HLL_SIZE:
                yield (key, res)
        else:
            raise UnhandledTypeError(outputType)

    def stats_mapper_init(self):
        self.counters = Counter()

    def stats_mapper(self, key, value):
        if key[0] in (CST.url.value, CST.digest.value,
                      CST.size_estimate_for.value):
            return
        if ((self.options.min_domain_frequency > 1)
                and (key[0] in (CST.host.value, CST.domain.value,
                                CST.surt_domain.value))):
            # quick skip of infrequent host and domains,
            # significantly limits amount of tuples processed in reducer
            page_count = MultiCount.get_count(0, value)
            url_count = MultiCount.get_count(1, value)
            self.counters[(CST.size.value, key[0], key[2])] += 1
            self.counters[(CST.histogram.value, key[0], key[2], CST.page.value,
                           page_count)] += 1
            self.counters[(CST.histogram.value, key[0], key[2], CST.url.value,
                           url_count)] += 1
            if key[0] in (CST.domain.value, CST.surt_domain.value):
                host_count = MultiCount.get_count(2, value)
                self.counters[(CST.histogram.value, key[0], key[2],
                               CST.host.value, host_count)] += 1
            if url_count < self.options.min_domain_frequency:
                return
        if key[0] == CST.languages.value:
            # yield only frequent language combinations (if configured)
            page_count = MultiCount.get_count(0, value)
            if ((self.options.min_lang_comb_freq > 1)
                    and (page_count < self.options.min_lang_comb_freq)
                    and (',' in key[1])):
                return
        yield key, value

    def stats_mapper_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count

    def stats_reducer(self, key, values):
        outputType = CST(key[0])
        item = key[1]
        crawl = MonthlyCrawl.to_name(key[2])
        if outputType in (CST.size, CST.new_items, CST.size_estimate,
                          CST.size_robotstxt):
            verbose_key = (outputType.name, CST(item).name, crawl)
            if outputType in (CST.size, CST.size_robotstxt):
                val = sum(values)
            elif outputType == CST.new_items:
                val = MultiCount.sum_values(values)
            elif outputType == CST.size_estimate:
                # already "reduced" in count job
                for val in values:
                    break
            yield verbose_key, val
        elif outputType == CST.histogram:
            yield ((outputType.name, CST(item).name, crawl, CST(key[3]).name,
                    key[4]), sum(values))
        elif outputType in (CST.mimetype, CST.mimetype_detected, CST.charset,
                            CST.languages, CST.primary_language, CST.scheme,
                            CST.surt_domain, CST.tld, CST.domain, CST.host,
                            CST.http_status, CST.robotstxt_status):
            item = key[1]
            for counts in values:
                page_count = MultiCount.get_count(0, counts)
                url_count = MultiCount.get_count(1, counts)
                if outputType in (CST.domain, CST.surt_domain, CST.tld):
                    host_count = MultiCount.get_count(2, counts)
                if (self.options.min_domain_frequency <= 1 or outputType
                        not in (CST.host, CST.domain, CST.surt_domain)):
                    self.counters[(CST.size.name, outputType.name, crawl)] += 1
                    self.counters[(CST.histogram.name, outputType.name, crawl,
                                   CST.page.name, page_count)] += 1
                    self.counters[(CST.histogram.name, outputType.name, crawl,
                                   CST.url.name, url_count)] += 1
                    if outputType in (CST.domain, CST.surt_domain, CST.tld):
                        self.counters[(CST.histogram.name, outputType.name,
                                       crawl, CST.host.name, host_count)] += 1
                if outputType == CST.tld:
                    domain_count = MultiCount.get_count(3, counts)
                    self.counters[(CST.histogram.name, outputType.name, crawl,
                                   CST.domain.name, domain_count)] += 1
                if outputType in (CST.domain, CST.host, CST.surt_domain):
                    outKey = (outputType.name, crawl)
                    outVal = (page_count, url_count, item)
                    if outputType in (CST.domain, CST.surt_domain):
                        outVal = (page_count, url_count, host_count, item)
                    # take most common
                    if len(self.mostfrequent[outKey]) < self.options.max_hosts:
                        heapq.heappush(self.mostfrequent[outKey], outVal)
                    else:
                        heapq.heappushpop(self.mostfrequent[outKey], outVal)
                else:
                    yield ((outputType.name, item, crawl), counts)
        else:
            raise UnhandledTypeError(outputType)

    def reducer_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count
        for key, mostfrequent in self.mostfrequent.items():
            (outputType, crawl) = key
            if outputType in (CST.domain.name, CST.surt_domain.name):
                for (pages, urls, hosts, item) in mostfrequent:
                    yield ((outputType, item, crawl),
                           MultiCount.compress(3, [pages, urls, hosts]))
            else:
                for (pages, urls, item) in mostfrequent:
                    yield ((outputType, item, crawl),
                           MultiCount.compress(2, [pages, urls]))

    def steps(self):
        reduces = 10
        cdxminsplitsize = 2**32  # do not split cdx map input files
        if self.options.exact_counts:
            # with exact counts need many reducers to aggregate the counts
            # in reasonable time and to get not too large partitions
            reduces = 200
        count_job = \
            MRStep(mapper_init=self.count_mapper_init,
                   mapper=self.count_mapper,
                   mapper_final=self.count_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.count_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': reduces,
                            'mapreduce.input.fileinputformat.split.minsize':
                                cdxminsplitsize,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.BZip2Codec'})
        stats_job = \
            MRStep(mapper_init=self.stats_mapper_init,
                   mapper=self.stats_mapper,
                   mapper_final=self.stats_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.stats_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': 1,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.GzipCodec'})
        if self.options.job_to_run == 'count':
            return [count_job]
        if self.options.job_to_run == 'stats':
            return [stats_job]
        return [count_job, stats_job]
from datetime import datetime, timedelta
from hll_custom import HyperLogLog as HyperLogLogCustom
from hyperloglog import HyperLogLog
from json import loads
from sseclient import SSEClient
from sys import getsizeof

start_time = datetime.now()

duration_counter = 1
max_duration = 24
duration = timedelta(hours=duration_counter)

hll = HyperLogLog(0.01)
hll_custom = HyperLogLogCustom(0.01)
naive = set()
write_counter = 0

hll_log = []
hll_size_log = []
hll_custom_log = []
hll_custom_size_log = []
naive_log = []
naive_size_log = []

url = 'https://stream.wikimedia.org/v2/stream/'
url_stream = 'recentchange'

stream = SSEClient(url + url_stream)

for event in stream:
Example #21
def profile(csvreader, colnames, samplesize=1000):

    # Set the chunk size read from the file equal to the number of records
    # being profiled when the profile size is less than the chunk size;
    # this reduces unnecessary reads on the file.

    stats = {}

    # build a dictionary with the initial set of statistics

    for i, col in enumerate(colnames):
        stats[i] = {
            'Name': col,
            'TypeList': [],
            'MaxVal': None,
            'MinVal': None,
            'MaxLen': None,
            'DecPlaces': None,
            'Nulls': False,
            'Cardinality': HyperLogLog(0.01)
        }

    rec_cnt = 0

    for row in csvreader:

        if rec_cnt == samplesize:
            break

        rec_cnt += 1

        for i, col in enumerate(row):

            if col:

                t = find_type(col)

                if t not in stats[i]['TypeList']:

                    stats[i]['TypeList'].append(t)

                if stats[i]['MaxVal'] is None or col > stats[i]['MaxVal']:

                    stats[i]['MaxVal'] = col

                if stats[i]['MinVal'] is None or col < stats[i]['MinVal']:

                    stats[i]['MinVal'] = col

                v_len = len(col)

                if stats[i]['MaxLen'] is None or v_len > stats[i]['MaxLen']:

                    stats[i]['MaxLen'] = v_len

                if t == 'float':

                    dec_places = len(str(col).split('.')[1])
                    #print str(col)
                    if stats[i]['DecPlaces'] is None or stats[i]['DecPlaces'] < dec_places:

                        stats[i]['DecPlaces'] = dec_places

                try:

                    stats[i]['Cardinality'].add(col)

                except:

                    pass

            else:

                stats[i]['Nulls'] = True

    filestats = []

    for key, val in stats.items():

        # set default type to string when no type was found
        if not val['TypeList']:
            val['TypeList'].append('string')

        types_found = [TYPES[t] for t in val['TypeList']]
        recommend_type = max(types_found, key=lambda x: x['weight'])

        filestats.append([
            val['Name'], ','.join(val['TypeList']), val['MaxLen'],
            val['MinVal'], val['MaxVal'], val['DecPlaces'], val['Nulls'],
            int(val['Cardinality'].card()), recommend_type['ansi']
        ])

    return rec_cnt, filestats
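A usage sketch for profile(): it expects an open csv reader plus the column names and returns the record count and per-column statistics. The file name is made up, and find_type and TYPES must come from the enclosing module:

import csv

with open('data.csv', newline='') as f:  # hypothetical input file
    reader = csv.reader(f)
    colnames = next(reader)              # header row
    rec_cnt, filestats = profile(reader, colnames, samplesize=500)

for row in filestats:
    # Name, types, max length, min, max, decimal places, nulls?, cardinality, ANSI type
    print(row)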
 def update_group(self, group_key, aggregation):
     hyperloglog = HyperLogLog(
         0.01)  # do not care about the error rate, it will be overwritten
     hyperloglog.load(aggregation['__value__'])
     self._groups[group_key].update(hyperloglog)
class CCStatsJob(MRJob):
    '''Job to get crawl statistics from Common Crawl index
       --job=count
            run count job (first step) to get counts
            from Common Crawl index files (cdx-*.gz)
       --job=stats
            run statistics job (second step) on output
            from count job'''

    OUTPUT_PROTOCOL = JSONProtocol

    JOBCONF = {
        'mapreduce.task.timeout': '9600000',
        'mapreduce.map.speculative': 'false',
        'mapreduce.reduce.speculative': 'false',
        'mapreduce.job.jvm.numtasks': '-1',
    }

    s3pattern = re.compile('^s3://([^/]+)/(.+)')
    gzpattern = re.compile(r'\.gz$')
    crawlpattern = re.compile(r'(CC-MAIN-2\d{3}-\d{2})')

    def configure_args(self):
        """Custom command line options for common crawl index statistics"""
        super(CCStatsJob, self).configure_args()
        self.add_passthru_arg(
            '--job', dest='job_to_run',
            default='', choices=['count', 'stats', ''],
            help='''Job(s) to run ("count", "stats", or empty to run both)''')
        self.add_passthru_arg(
            '--exact-counts', dest='exact_counts',
            action='store_true', default=None,
            help='''Exact counts for URLs and content digests,
                    this increases the output size significantly''')
        self.add_passthru_arg(
            '--no-exact-counts', dest='exact_counts',
            action='store_false', default=None,
            help='''No exact counts for URLs and content digests
                    to save storage space and computation time''')
        self.add_passthru_arg(
            '--max-top-hosts-domains', dest='max_hosts',
            type=int, default=200,
            help='''Max. number of most frequent hosts or domains shown
                    in final statistics (cf. --min-urls-top-host-domain)''')
        self.add_passthru_arg(
            '--min-urls-top-host-domain', dest='min_domain_frequency',
            type=int, default=1,
            help='''Min. number of URLs required per host or domain shown
                    in final statistics (cf. --max-top-hosts-domains).''')
        self.add_passthru_arg(
            '--min-lang-comb-freq', dest='min_lang_comb_freq',
            type=int, default=1,
            help='''Min. number of pages required for a combination of detected
                    languages to be shown in final statistics.''')
        self.add_passthru_arg(
            '--crawl', dest='crawl', default=None,
            help='''ID/name of the crawl analyzed (if not given detected
                    from input path)''')

    def input_protocol(self):
        if self.options.job_to_run != 'stats':
            LOG.debug('Reading text input from cdx files')
            return RawValueProtocol()
        LOG.debug('Reading JSON input from count job')
        return JSONProtocol()

    def hadoop_input_format(self):
        input_format = self.HADOOP_INPUT_FORMAT
        if self.options.job_to_run != 'stats':
            input_format = 'org.apache.hadoop.mapred.TextInputFormat'
        LOG.info("Setting input format for {} job: {}".format(
            self.options.job_to_run, input_format))
        return input_format

    def count_mapper_init(self):
        """Because cdx.gz files cannot be split and
        mapreduce.input.fileinputformat.split.minsize is set to a value larger
        than any cdx.gz file, the mapper is guaranteed to process the content
        of a single cdx file. Input lines of a cdx file are sorted by SURT URL
        which makes it possible to aggregate URL counts for one SURT domain
        in memory. A single SURT domain may nevertheless span multiple cdx
        files. In this case (and without --exact-counts) the count of unique
        URLs and the URL histograms may be slightly off if the same URL also
        occurs in a second cdx file. However, this problem is negligible because
        there are only 300 cdx files."""
        self.counters = Counter()
        self.cdx_path = os.environ['mapreduce_map_input_file']
        LOG.info('Reading {0}'.format(self.cdx_path))
        self.crawl_name = None
        self.crawl = None
        if self.options.crawl is not None:
            self.crawl_name = self.options.crawl
        else:
            crawl_name_match = self.crawlpattern.search(self.cdx_path)
            if crawl_name_match is not None:
                self.crawl_name = crawl_name_match.group(1)
            else:
                raise InputError(
                    "Cannot determine ID of monthly crawl from input path {}"
                    .format(self.cdx_path))
        if self.crawl_name is None:
            raise InputError("Name of crawl not given")
        self.crawl = MonthlyCrawl.get_by_name(self.crawl_name)
        self.fetches_total = 0
        self.pages_total = 0
        self.urls_total = 0
        self.urls_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.digest_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.url_histogram = Counter()
        self.count = None
        # first and last SURT may continue in previous/next cdx
        self.min_surt_hll_size = 1
        self.increment_counter('cdx-stats', 'cdx files processed', 1)

    def count_mapper(self, _, line):
        self.fetches_total += 1
        if (self.fetches_total % 1000) == 0:
            self.increment_counter('cdx-stats', 'cdx lines read', 1000)
            if (self.fetches_total % 100000) == 0:
                LOG.info('Read {0} cdx lines'.format(self.fetches_total))
            else:
                LOG.debug('Read {0} cdx lines'.format(self.fetches_total))
        parts = line.split(' ')
        [surt_domain, path] = parts[0].split(')', 1)
        if self.count is None:
            self.count = SurtDomainCount(surt_domain)
        if surt_domain != self.count.surt_domain:
            # output accumulated statistics for one SURT domain
            for pair in self.count.output(self.crawl,
                                          self.options.exact_counts,
                                          self.min_surt_hll_size):
                yield pair
            self.urls_total += self.count.unique_urls()
            for url, cnt in self.count.url.items():
                self.urls_hll.add(url)
                self.url_histogram[cnt] += 1
            for digest in self.count.digest:
                self.digest_hll.add(digest)
            self.pages_total += self.count.pages
            self.count = SurtDomainCount(surt_domain)
            self.min_surt_hll_size = MIN_SURT_HLL_SIZE
        json_string = ' '.join(parts[2:])
        try:
            metadata = ujson.loads(json_string)
            self.count.add(path, metadata)
        except ValueError as e:
            LOG.error('Failed to parse json: {0} - {1}'.format(
                e, json_string))

    def count_mapper_final(self):
        self.increment_counter('cdx-stats',
                               'cdx lines read', self.fetches_total % 1000)
        if self.count is None:
            return
        for pair in self.count.output(self.crawl, self.options.exact_counts, 1):
            yield pair
        self.urls_total += self.count.unique_urls()
        for url, cnt in self.count.url.items():
            self.urls_hll.add(url)
            self.url_histogram[cnt] += 1
        for digest in self.count.digest:
            self.digest_hll.add(digest)
        self.pages_total += self.count.pages
        if not self.options.exact_counts:
            for count, frequency in self.url_histogram.items():
                yield((CST.histogram.value, CST.url.value, self.crawl,
                       CST.page.value, count), frequency)
        yield (CST.size.value, CST.page.value, self.crawl), self.pages_total
        yield (CST.size.value, CST.fetch.value, self.crawl), self.fetches_total
        if not self.options.exact_counts:
            yield (CST.size.value, CST.url.value, self.crawl), self.urls_total
        yield((CST.size_estimate.value, CST.url.value, self.crawl),
              CrawlStatsJSONEncoder.json_encode_hyperloglog(self.urls_hll))
        yield((CST.size_estimate.value, CST.digest.value, self.crawl),
              CrawlStatsJSONEncoder.json_encode_hyperloglog(self.digest_hll))
        self.increment_counter('cdx-stats', 'cdx files finished', 1)

    def reducer_init(self):
        self.counters = Counter()
        self.mostfrequent = defaultdict(list)

    def count_reducer(self, key, values):
        outputType = key[0]
        if outputType in (CST.size.value, CST.size_robotstxt.value):
            yield key, sum(values)
        elif outputType == CST.histogram.value:
            yield key, sum(values)
        elif outputType in (CST.url.value, CST.digest.value):
            # only with --exact-counts
            crawls = MonthlyCrawlSet()
            new_crawls = set()
            page_count = MultiCount(2)
            for val in values:
                if type(val) is list:
                    if (outputType == CST.url.value):
                        (crawl, pages) = val
                        page_count.incr(crawl, pages, 1)
                    else:  # digest
                        (crawl, (pages, urls)) = val
                        page_count.incr(crawl, pages, urls)
                    crawls.add(crawl)
                    new_crawls.add(crawl)
                else:
                    # crawl set bit mask
                    crawls.update(val)
            yield key, crawls.get_bits()
            for new_crawl in new_crawls:
                if crawls.is_new(new_crawl):
                    self.counters[(CST.new_items.value,
                                   outputType, new_crawl)] += 1
            # url/digest duplicate histograms
            for crawl, counts in page_count.items():
                items = (1+counts[0]-counts[1])
                self.counters[(CST.histogram.value, outputType,
                               crawl, CST.page.value, items)] += 1
            # size in terms of unique URLs and unique content digests
            for crawl, counts in page_count.items():
                self.counters[(CST.size.value, outputType, crawl)] += 1
        elif outputType in (CST.mimetype.value,
                            CST.mimetype_detected.value,
                            CST.charset.value,
                            CST.languages.value,
                            CST.primary_language.value,
                            CST.scheme.value,
                            CST.tld.value,
                            CST.domain.value,
                            CST.surt_domain.value,
                            CST.host.value,
                            CST.http_status.value,
                            CST.robotstxt_status.value):
            yield key, MultiCount.sum_values(values)
        elif outputType == CST.size_estimate.value:
            hll = HyperLogLog(HYPERLOGLOG_ERROR)
            for val in values:
                hll.update(
                    CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
            yield(key,
                  CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
        elif outputType == CST.size_estimate_for.value:
            res = None
            hll = None
            cnt = 0
            for val in values:
                if res:
                    if hll is None:
                        cnt = res[0]
                        hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(res[1])
                    cnt += val[0]
                    hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
                else:
                    res = val
            if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
                yield(key, (cnt, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
            elif res[0] >= MIN_SURT_HLL_SIZE:
                yield(key, res)
        else:
            raise UnhandledTypeError(outputType)

    def stats_mapper_init(self):
        self.counters = Counter()

    def stats_mapper(self, key, value):
        if key[0] in (CST.url.value, CST.digest.value,
                      CST.size_estimate_for.value):
            return
        if ((self.options.min_domain_frequency > 1) and
            (key[0] in (CST.host.value, CST.domain.value,
                        CST.surt_domain.value))):
            # quick skip of infrequent host and domains,
            # significantly limits amount of tuples processed in reducer
            page_count = MultiCount.get_count(0, value)
            url_count = MultiCount.get_count(1, value)
            self.counters[(CST.size.value, key[0], key[2])] += 1
            self.counters[(CST.histogram.value, key[0],
                           key[2], CST.page.value, page_count)] += 1
            self.counters[(CST.histogram.value, key[0],
                           key[2], CST.url.value, url_count)] += 1
            if key[0] in (CST.domain.value, CST.surt_domain.value):
                host_count = MultiCount.get_count(2, value)
                self.counters[(CST.histogram.value, key[0],
                               key[2], CST.host.value, host_count)] += 1
            if url_count < self.options.min_domain_frequency:
                return
        if key[0] == CST.languages.value:
            # yield only frequent language combinations (if configured)
            page_count = MultiCount.get_count(0, value)
            if ((self.options.min_lang_comb_freq > 1) and
                    (page_count < self.options.min_lang_comb_freq) and
                    (',' in key[1])):
                return
        yield key, value

    def stats_mapper_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count

    def stats_reducer(self, key, values):
        outputType = CST(key[0])
        item = key[1]
        crawl = MonthlyCrawl.to_name(key[2])
        if outputType in (CST.size, CST.new_items,
                          CST.size_estimate, CST.size_robotstxt):
            verbose_key = (outputType.name, CST(item).name, crawl)
            if outputType in (CST.size, CST.size_robotstxt):
                val = sum(values)
            elif outputType == CST.new_items:
                val = MultiCount.sum_values(values)
            elif outputType == CST.size_estimate:
                # already "reduced" in count job
                for val in values:
                    break
            yield verbose_key, val
        elif outputType == CST.histogram:
            yield((outputType.name, CST(item).name, crawl,
                   CST(key[3]).name, key[4]), sum(values))
        elif outputType in (CST.mimetype, CST.mimetype_detected, CST.charset,
                            CST.languages, CST.primary_language, CST.scheme,
                            CST.surt_domain, CST.tld, CST.domain, CST.host,
                            CST.http_status, CST.robotstxt_status):
            item = key[1]
            for counts in values:
                page_count = MultiCount.get_count(0, counts)
                url_count = MultiCount.get_count(1, counts)
                if outputType in (CST.domain, CST.surt_domain, CST.tld):
                    host_count = MultiCount.get_count(2, counts)
                if (self.options.min_domain_frequency <= 1 or
                    outputType not in (CST.host, CST.domain,
                                       CST.surt_domain)):
                    self.counters[(CST.size.name, outputType.name, crawl)] += 1
                    self.counters[(CST.histogram.name, outputType.name,
                                   crawl, CST.page.name, page_count)] += 1
                    self.counters[(CST.histogram.name, outputType.name,
                                   crawl, CST.url.name, url_count)] += 1
                    if outputType in (CST.domain, CST.surt_domain, CST.tld):
                        self.counters[(CST.histogram.name, outputType.name,
                                       crawl, CST.host.name, host_count)] += 1
                if outputType == CST.tld:
                    domain_count = MultiCount.get_count(3, counts)
                    self.counters[(CST.histogram.name, outputType.name,
                                   crawl, CST.domain.name, domain_count)] += 1
                if outputType in (CST.domain, CST.host, CST.surt_domain):
                    outKey = (outputType.name, crawl)
                    outVal = (page_count, url_count, item)
                    if outputType in (CST.domain, CST.surt_domain):
                        outVal = (page_count, url_count, host_count, item)
                    # keep only the most frequent items (at most max_hosts) in a bounded min-heap
                    if len(self.mostfrequent[outKey]) < self.options.max_hosts:
                        heapq.heappush(self.mostfrequent[outKey], outVal)
                    else:
                        heapq.heappushpop(self.mostfrequent[outKey], outVal)
                else:
                    yield((outputType.name, item, crawl), counts)
        else:
            raise UnhandledTypeError(outputType)

    def reducer_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count
        for key, mostfrequent in self.mostfrequent.items():
            (outputType, crawl) = key
            if outputType in (CST.domain.name, CST.surt_domain.name):
                for (pages, urls, hosts, item) in mostfrequent:
                    yield((outputType, item, crawl),
                          MultiCount.compress(3, [pages, urls, hosts]))
            else:
                for (pages, urls, item) in mostfrequent:
                    yield((outputType, item, crawl),
                          MultiCount.compress(2, [pages, urls]))

    def steps(self):
        reduces = 10
        cdxminsplitsize = 2**32  # do not split cdx map input files
        if self.options.exact_counts:
            # with exact counts, many reducers are needed to aggregate the
            # counts in reasonable time and to keep partitions reasonably small
            reduces = 200
        count_job = \
            MRStep(mapper_init=self.count_mapper_init,
                   mapper=self.count_mapper,
                   mapper_final=self.count_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.count_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': reduces,
                            'mapreduce.input.fileinputformat.split.minsize':
                                cdxminsplitsize,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.BZip2Codec'})
        stats_job = \
            MRStep(mapper_init=self.stats_mapper_init,
                   mapper=self.stats_mapper,
                   mapper_final=self.stats_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.stats_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': 1,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.GzipCodec'})
        if self.options.job_to_run == 'count':
            return [count_job]
        if self.options.job_to_run == 'stats':
            return [stats_job]
        return [count_job, stats_job]
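The steps() method above switches on several passthrough options (exact_counts, job_to_run, min_domain_frequency, min_lang_comb_freq, max_hosts) whose declarations are not part of this excerpt. Below is a minimal sketch of how such options could be registered with mrjob, assuming mrjob >= 0.6 where configure_args()/add_passthru_arg() are available; the class name, flag spellings, defaults, and help texts are illustrative guesses derived from the self.options names, not the project's actual code:

from mrjob.job import MRJob


class CrawlStatsOptionsSketch(MRJob):
    """Hypothetical skeleton: only shows how the options referenced via
    self.options in the job above might be declared."""

    def configure_args(self):
        super().configure_args()
        # passthrough args are visible on all task nodes as self.options.<dest>
        self.add_passthru_arg(
            '--exact-counts', action='store_true', default=False,
            help='exact counts for URLs and content digests')
        self.add_passthru_arg(
            '--job-to-run', default=None, choices=['count', 'stats'],
            help='run only the count step or only the stats step')
        self.add_passthru_arg(
            '--min-domain-frequency', type=int, default=1,
            help='min. number of URLs required to report a host or domain')
        self.add_passthru_arg(
            '--min-lang-comb-freq', type=int, default=1,
            help='min. number of pages required to report a language combination')
        self.add_passthru_arg(
            '--max-hosts', type=int, default=200,
            help='number of most frequent hosts/domains kept per crawl')

With declarations like these, argparse maps --min-domain-frequency to self.options.min_domain_frequency and so on, so the filters and counters above pick up their thresholds from the command line.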
Example #24
    },
    {
        "name": "Log Log Register",
        "init": LLRegister,
    },
    #{
    #"name" : "LogLog",
    #"init" : lambda : LL(4),
    #},
    #{
    #"name" : "SuperLogLog",
    #"init" : lambda : SuperLL(4),
    #},
    {
        "name": "HyperLogLog",
        "init": lambda: HyperLogLog(4),
    },
    {
        "name": "KMinValues",
        "init": lambda: KMinValues(2 << 4),
    },
    {
        "name": "ScalingBloom",
        "init": lambda: ScalingBloomFilter(2048),
    },
]


def run_experiment(exp_name, filename, key_generator, data, sample_freq=3000):
    for item in data:
        item["_tmp"] = item["init"]()
 def count_reducer(self, key, values):
     outputType = key[0]
     if outputType in (CST.size.value, CST.size_robotstxt.value):
         yield key, sum(values)
     elif outputType == CST.histogram.value:
         yield key, sum(values)
     elif outputType in (CST.url.value, CST.digest.value):
         # only with --exact-counts
         crawls = MonthlyCrawlSet()
         new_crawls = set()
         page_count = MultiCount(2)
         for val in values:
             if type(val) is list:
                 if (outputType == CST.url.value):
                     (crawl, pages) = val
                     page_count.incr(crawl, pages, 1)
                 else:  # digest
                     (crawl, (pages, urls)) = val
                     page_count.incr(crawl, pages, urls)
                 crawls.add(crawl)
                 new_crawls.add(crawl)
             else:
                 # crawl set bit mask
                 crawls.update(val)
         yield key, crawls.get_bits()
         for new_crawl in new_crawls:
             if crawls.is_new(new_crawl):
                 self.counters[(CST.new_items.value,
                                outputType, new_crawl)] += 1
         # url/digest duplicate histograms
         for crawl, counts in page_count.items():
             items = (1+counts[0]-counts[1])
             self.counters[(CST.histogram.value, outputType,
                            crawl, CST.page.value, items)] += 1
         # size in terms of unique URLs and unique content digests
         for crawl, counts in page_count.items():
             self.counters[(CST.size.value, outputType, crawl)] += 1
     elif outputType in (CST.mimetype.value,
                         CST.mimetype_detected.value,
                         CST.charset.value,
                         CST.languages.value,
                         CST.primary_language.value,
                         CST.scheme.value,
                         CST.tld.value,
                         CST.domain.value,
                         CST.surt_domain.value,
                         CST.host.value,
                         CST.http_status.value,
                         CST.robotstxt_status.value):
         yield key, MultiCount.sum_values(values)
     elif outputType == CST.size_estimate.value:
         hll = HyperLogLog(HYPERLOGLOG_ERROR)
         for val in values:
             hll.update(
                 CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
         yield(key,
               CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
     elif outputType == CST.size_estimate_for.value:
         res = None
         hll = None
         cnt = 0
         for val in values:
             if res:
                 if hll is None:
                     cnt = res[0]
                     hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(res[1])
                 cnt += val[0]
                 hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
             else:
                 res = val
         if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
             yield(key, (cnt, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
         elif res[0] >= MIN_SURT_HLL_SIZE:
             yield(key, res)
     else:
         raise UnhandledTypeError(outputType)
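Both the count and the stats reducer rely on a MultiCount helper (incr, items, get_count, sum_values, compress) that is not included in this excerpt. The sketch below is reconstructed only from the calls visible above; in particular the compression rule (drop trailing counters that repeat the preceding value, store a single remaining counter as a bare int) is an assumption, not the project's definition:

from collections import defaultdict


class MultiCountSketch(defaultdict):
    """Hypothetical stand-in for MultiCount: maps a key (e.g. a crawl id)
    to a fixed-size list of counters (pages, urls, hosts, ...)."""

    def __init__(self, size):
        super().__init__(lambda: [0] * size)
        self.size = size

    def incr(self, key, *counts):
        # element-wise addition of the given counters for one key
        for i, c in enumerate(counts):
            self[key][i] += c

    @staticmethod
    def get_count(index, value):
        # stored values may be "compressed": a bare int or a shortened list
        if isinstance(value, int):
            return value
        if index >= len(value):
            return value[-1]
        return value[index]

    @staticmethod
    def compress(size, counts):
        # assumed rule: drop trailing counters equal to their predecessor;
        # a single remaining counter is stored as a plain int
        counts = list(counts[:size])
        while len(counts) > 1 and counts[-1] == counts[-2]:
            counts.pop()
        return counts[0] if len(counts) == 1 else counts

    @staticmethod
    def sum_values(values):
        # element-wise sum over (possibly compressed) values
        values = list(values)
        width = max(1 if isinstance(v, int) else len(v) for v in values)
        totals = [0] * width
        for v in values:
            for i in range(width):
                totals[i] += MultiCountSketch.get_count(i, v)
        return MultiCountSketch.compress(width, totals)


page_count = MultiCountSketch(2)
page_count.incr('CC-MAIN-2017-13', 3, 1)          # 3 pages, 1 unique URL
page_count.incr('CC-MAIN-2017-13', 2, 1)
print(dict(page_count))                           # {'CC-MAIN-2017-13': [5, 2]}
print(MultiCountSketch.compress(3, [10, 7, 7]))   # [10, 7]
print(MultiCountSketch.sum_values([[5, 2], 4]))   # [9, 6]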
from hyperloglog import HyperLogLog

# exact count: every sheep id is stored in the set
sheep_seen = set()
# approximate count: HyperLogLog with a relative error of about 1%
sheep_seen_hll = HyperLogLog(0.01)

for m in range(100000):
    sheep_id = str(m)
    sheep_seen.add(sheep_id)
    sheep_seen_hll.add(sheep_id)

print(f"There are {len(sheep_seen)} sheep (set).")
print(f"There are {len(sheep_seen_hll)} sheep (hyperloglog).")