def filter_record(self, warc_record, article=None):
        """
        Keep only COVID-related articles published by a known state broadcaster.

        The record is rejected unless all of the following hold: the registered
        domain of its target URI is a key of STATE_BROADCASTERS, the parent
        class' filter accepts it, the article language is accepted by
        is_european_langcode, a searcher exists for that language, and the
        searcher matches the lower-cased title or main text.

        :param warc_record: the WARC record to test
        :param article: an already extracted article, or None to extract lazily
        :return: A tuple of (True or False) and an article (might be None)
        """
        target_uri = warc_record.rec_headers.get_header('WARC-Target-URI')
        registered = tldextract.extract(target_uri).registered_domain
        # Cheap domain test first, so the parent filter and article extraction
        # only run for broadcaster domains.
        if registered not in STATE_BROADCASTERS:
            return False, article
        broadcaster_country = STATE_BROADCASTERS[registered]

        parent_ok, article = super().filter_record(warc_record, article)
        if not parent_ok:
            return False, article

        if article is None:
            article = NewsPlease.from_warc(warc_record)
        article.country = broadcaster_country

        if not (article.language and is_european_langcode(article.language)):
            return False, article

        covid_searcher = get_covid_searchers().get(article.language)
        if covid_searcher is None:
            return False, article

        def mentions_covid(field):
            # Missing/None fields are searched as the empty string.
            text = getattr(article, field) or ""
            return covid_searcher.match(text.lower().encode("utf-8"))

        # any() short-circuits: the main text is only searched when the title
        # did not match.
        found = any(mentions_covid(field) for field in ("title", "maintext"))
        return found, article
示例#2
0
    def filter_record(self, warc_record, article=None):
        """
        Keep only COVID-related articles attributed to a European country.

        The record must pass the parent class' filter, detect_country must
        yield a country accepted by is_european_cc, the article language must
        be accepted by is_european_langcode, and the language-specific searcher
        must match the lower-cased title or main text.

        :param warc_record: the WARC record to test
        :param article: an already extracted article, or None to extract lazily
        :return: A tuple of (True or False) and an article (might be None)
        """
        passed_filters, article = super().filter_record(warc_record, article)
        if not passed_filters:
            return False, article
        url = warc_record.rec_headers.get_header('WARC-Target-URI')

        def get_lang():
            # Lazy language lookup handed to detect_country: the article is
            # only extracted if the callback is actually invoked.
            nonlocal article
            if article is None:
                article = NewsPlease.from_warc(warc_record)
            return article.language

        country = detect_country(url, get_lang)
        if not country or not is_european_cc(country):
            return False, article
        # get_lang may never have been called, so the article can still be
        # None here; extract it BEFORE touching its attributes.
        if article is None:
            article = NewsPlease.from_warc(warc_record)
        article.country = country
        lang = article.language
        if not lang or not is_european_langcode(lang):
            return False, article
        searcher = get_covid_searchers().get(lang)
        if searcher is None:
            return False, article

        def match(key):
            # Missing/None fields are searched as the empty string.
            return searcher.match((getattr(article, key)
                                   or "").lower().encode("utf-8"))

        if match("title") or match("maintext"):
            return True, article
        # No COVID-19 mention found: reject. Returning True here would make
        # the two match checks above meaningless.
        return False, article
示例#3
0
 def filter_record(self, warc_record, article=None):
     """
     Keep only records whose canonicalized target URI is in ``all_urls``.

     The parent class' filter is applied first and its verdict is honoured,
     so a record it rejects is rejected here as well.

     :param warc_record: the WARC record to test
     :param article: an already extracted article, or None to extract lazily
     :return: A tuple of (True or False) and an article (might be None)
     """
     passed_filters, article = super().filter_record(warc_record, article)
     if not passed_filters:
         return False, article
     url = warc_record.rec_headers.get_header('WARC-Target-URI')
     canon_url = canonicalize_url(url)
     if canon_url not in all_urls:
         return False, article
     # Accepted records always come back with an extracted article.
     if article is None:
         article = NewsPlease.from_warc(warc_record)
     return True, article
示例#4
0
    def __filter_record(self, warc_record, article=None):
        """
        Returns true if a record passes all tests: hosts, publishing date, description text
        :param warc_record: the WARC record to test
        :param article: an already extracted article, or None to extract lazily
        :return: A tuple of (True or False) and an article (might be None)
        """
        # filter by host if list is populated - an empty host list makes the process extremely slow,
        # seemingly because every record then reaches the date checks
        if self.filter_valid_hosts:
            url = warc_record.rec_headers.get_header('WARC-Target-URI')
            # very simple check, check if one of the required host names is contained in the url of the WARC transaction
            # better would be to extract the host name from the WARC transaction Target URI and then check for equality
            # because currently something like g.co?forward_url=facebook.com would yield a positive filter test for
            # facebook.com even though the actual host is g.co
            if not any(valid_host in url for valid_host in self.filter_valid_hosts):
                return False, article

        # filter by date
        if self.filter_start_date or self.filter_end_date:
            if not article:
                article = NewsPlease.from_warc(warc_record)

            publishing_date = self.__get_publishing_date(article)

            if not publishing_date:
                if self.filter_strict_date:
                    return False, article
            else:
                # here we for sure have a date
                # is article published too early?
                if self.filter_start_date and publishing_date < self.filter_start_date:
                    return False, article
                # is article published too late? Only compare when an end date is configured,
                # otherwise None would be compared with a date and raise TypeError.
                if self.filter_end_date and self.filter_end_date < publishing_date:
                    return False, article

        # filter by description text
        # Without a date filter the article may not have been extracted yet; the description
        # lookup only receives the article, so make sure there is one.
        if not article:
            article = NewsPlease.from_warc(warc_record)

        description = self.__get_description_data(article)
        if not description or self.filter_text not in description:
            return False, article
        return True, article
示例#5
0
    def __process_warc_gz_file(self, path_name):
        """
        Iterates all transactions in one WARC file and for each transaction tries to extract an article object.
        Afterwards, each article is checked against the filter criteria and if all are passed, the function
        on_valid_article_extracted is invoked with the article object.

        Only records of type 'response' are counted and processed. Errors raised while handling a single
        record are logged and skipped when continue_after_error is set, otherwise they are re-raised.
        :param path_name: path of the WARC file, opened in binary mode
        :return: None
        """
        counter_article_total = 0
        counter_article_passed = 0
        counter_article_discarded = 0
        start_time = time.time()

        with open(path_name, 'rb') as stream:
            for record in ArchiveIterator(stream):
                try:
                    if record.rec_type != 'response':
                        continue
                    counter_article_total += 1

                    # if the article passes filter tests, we notify the user
                    filter_pass, article = self.__filter_record(record)
                    if filter_pass:
                        counter_article_passed += 1

                        # the filter may accept a record without having extracted the article
                        if not article:
                            article = NewsPlease.from_warc(record)

                        self.logger.info('article pass (%s; %s; %s)', article.sourceDomain, article.publish_date,
                                         article.title)
                        self.on_valid_article_extracted(article)
                    else:
                        counter_article_discarded += 1

                        if article:
                            self.logger.info('article discard (%s; %s; %s)', article.sourceDomain,
                                             article.publish_date,
                                             article.title)
                        else:
                            self.logger.info('article discard (%s)',
                                             record.rec_headers.get_header('WARC-Target-URI'))

                    # periodic progress report, every 10 response records
                    if counter_article_total % 10 == 0:
                        elapsed_secs = time.time() - start_time
                        secs_per_article = elapsed_secs / counter_article_total
                        self.logger.info('statistics')
                        self.logger.info('pass = %i, discard = %i, total = %i', counter_article_passed,
                                         counter_article_discarded, counter_article_total)
                        self.logger.info('extraction from current WARC file started %s; %f s/article',
                                         human(start_time), secs_per_article)
                except Exception:
                    # Exception rather than a bare except, so KeyboardInterrupt and SystemExit
                    # always propagate and the run can still be stopped.
                    if not self.continue_after_error:
                        raise
                    # exc_info=True adds the traceback to the log entry
                    self.logger.error('Unexpected error: %s', sys.exc_info()[0], exc_info=True)
示例#6
0
def process_warc_record(record):
    """Serialize the article extracted from a WARC response record to JSON.

    Records whose type is not 'response', and records for which no article is
    extracted, yield None. Any exception raised along the way is logged as a
    warning on the root logger and also yields None.

    :param record: the WARC record to process
    :return: compact JSON string of the article's ``__dict__`` (values that
        are not JSON-serializable are converted with ``str``), or None
    """
    serialized = None
    try:
        if record.rec_type == 'response':
            extracted = NewsPlease.from_warc(record)
            if extracted is not None:
                serialized = json.dumps(
                    extracted.__dict__,
                    default=str,
                    separators=(',', ':'),
                )
    except Exception as exc:
        # Best effort: one broken record must not abort the whole run.
        logging.getLogger().warning(
            'skipping record due to Exception: ' + str(exc))
    return serialized
示例#7
0
    def process_warc_file(self, path_name):
        """
        Iterates all transactions in one WARC file and for each transaction tries to extract an article object.
        Afterwards, each article is checked against the filter criteria and if all are passed, the function
        process_article is invoked with the article object.

        'warcinfo' records are logged and skipped; every other non-'response' record is skipped with a
        warning. When the file is finished, self.url is appended to self.downloaded_urls.
        :param path_name: path of the WARC file, opened in binary mode
        :return: None
        """
        total = 0
        passed = 0
        discarded = 0
        # NOTE: never incremented in this method, so it is always reported as 0
        error = 0
        start_time = time.time()

        with open(path_name, "rb") as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type == "warcinfo":
                    logger.info(record.raw_stream.read())
                    continue
                elif record.rec_type != "response":
                    logger.warning("WARC-Type: is not response")
                    continue
                total += 1
                # if the article passes filter tests, we notify the user
                if self.is_wanted_record(record):
                    passed += 1
                    article = NewsPlease.from_warc(record)
                    self.process_article(article)
                else:
                    discarded += 1
                    logger.debug(
                        "article discard: %s)",
                        record.rec_headers.get_header("WARC-Target-URI"),
                    )
                # periodic progress report, every 100 response records
                if total % 100 == 0:
                    logger.info(
                        "pass = %i, discard = %i, error = %i, total = %i",
                        passed,
                        discarded,
                        error,
                        total,
                    )
        # A file without any response record has total == 0; skip the average
        # instead of raising ZeroDivisionError (which would also skip the
        # bookkeeping below).
        if total:
            secs_per_article = (time.time() - start_time) / total
            logger.info(f"extracting WARC {secs_per_article} s/article")
        self.downloaded_urls.append(self.url)
    def __filter_record(self, warc_record, article=None):
        """
        Decide whether a record passes the configured host and publishing-date filters.

        The article is only extracted when a date filter is active and no article was passed in.
        :param warc_record: the WARC record to test
        :param article: an already extracted article, or None
        :return: A tuple of (True or False) and an article (might be None)
        """
        # --- host filter ---
        if self.__filter_valid_hosts:
            target_uri = warc_record.rec_headers.get_header('WARC-Target-URI')

            # Plain substring test: a required host name only has to occur somewhere in the URL.
            # Comparing against the host parsed from the Target URI would be more precise, since
            # g.co?forward_url=facebook.com currently passes a filter for facebook.com although
            # the actual host is g.co.
            if not any(host in target_uri for host in self.__filter_valid_hosts):
                return False, article

        # --- date filter ---
        if not (self.__filter_start_date or self.__filter_end_date):
            return True, article

        if not article:
            article = NewsPlease.from_warc(warc_record)

        published = self.__get_publishing_date(warc_record, article)
        if not published:
            # undated articles are rejected only in strict mode
            return (False if self.__filter_strict_date else True), article

        # from here on a date is available
        earliest = self.__filter_start_date
        latest = self.__filter_end_date
        too_early = bool(earliest) and published < earliest
        if too_early:
            return False, article
        too_late = bool(latest) and published > latest
        if too_late:
            return False, article

        return True, article
示例#9
0
    def __process_warc_gz_file(self, path_name):
        """
        Iterates all transactions in one WARC file and for each transaction tries to extract an article object.
        Afterwards, each article is checked against the filter criteria and if all are passed, the function
        on_valid_article_extracted is invoked with the article object.

        Only records of type 'response' are counted and processed. Errors raised while handling a single
        record are logged and skipped when continue_after_error is set, otherwise they are re-raised.
        :param path_name: path of the WARC file, opened in binary mode
        :return: None
        """
        counter_article_total = 0
        counter_article_passed = 0
        counter_article_discarded = 0
        start_time = time.time()

        with open(path_name, 'rb') as stream:
            # opens a file and returns a stream 'rb' = read/binary
            for record in ArchiveIterator(stream):
                try:
                    # Every WARC record shall have a type, reported in the WARC-Type field. There are eight WARC record
                    # types: 'warcinfo', 'response', 'resource', 'request', 'metadata', 'revisit', 'conversion',
                    # and 'continuation'.
                    if record.rec_type != 'response':
                        continue
                    counter_article_total += 1

                    # if the article passes filter tests, we notify the user
                    # this calls the filter function and returns a True / false and the article
                    filter_pass, article = self.__filter_record(record)

                    if filter_pass:
                        counter_article_passed += 1

                        # the filter may accept a record without having extracted the article
                        if not article:
                            article = NewsPlease.from_warc(record)
                        self.logger.info('article pass (%s; %s; %s)',
                                         article.source_domain,
                                         article.date_publish,
                                         article.title)
                        self.on_valid_article_extracted(article)
                    else:
                        counter_article_discarded += 1

                        if article:
                            self.logger.info(
                                'article discard (%s; %s; %s)',
                                article.source_domain,
                                article.date_publish, article.title)
                        else:
                            self.logger.info(
                                'article discard (%s)',
                                record.rec_headers.get_header(
                                    'WARC-Target-URI'))

                    # periodic progress report, every 10 response records
                    if counter_article_total % 10 == 0:
                        elapsed_secs = time.time() - start_time
                        secs_per_article = elapsed_secs / counter_article_total
                        self.logger.info('statistics')
                        self.logger.warning(
                            'pass = %i, discard = %i, total = %i',
                            counter_article_passed,
                            counter_article_discarded,
                            counter_article_total)
                        self.logger.warning(
                            'extraction from current WARC file started %s; %f s/article',
                            human(start_time), secs_per_article)
                except Exception:
                    # Exception rather than a bare except, so KeyboardInterrupt and SystemExit
                    # always propagate and the run can still be stopped.
                    if not self.continue_after_error:
                        raise
                    # exc_info=True adds the traceback to the log entry
                    self.logger.error('Unexpected error: %s',
                                      sys.exc_info()[0],
                                      exc_info=True)
示例#10
0
 def get_lang():
     """Return the article's language, extracting the article on first use.

     Nested helper: ``article`` and ``warc_record`` come from the enclosing
     function's scope, which is not part of this snippet.
     """
     # Rebind the enclosing function's variable so the extracted article is
     # kept for later use rather than parsed again.
     nonlocal article
     if article is None:
         article = NewsPlease.from_warc(warc_record)
     return article.language