Example #1
    def run(self):
        # Set up necessary data so we know which targets are watched:
        feed = yield CrawlFeed('all')
        logs_count = len(self.input())

        # Cache targets in an appropriately unique filename (as unique as this task):
        hdfs_targets = yield SyncToHdfs(feed.path,
                                        '/tmp/cache/crawl-feed-%s-%s-%i.json' %
                                        (self.job, self.launch_id, logs_count),
                                        overwrite=True)

        # Turn the logs into a list:
        log_paths = []
        for log_file in self.input():
            log_paths.append(log_file.path)

        # Yield a task for processing all the current logs (Hadoop job):
        log_stats = yield AnalyseLogFile(self.job, self.launch_id, log_paths,
                                         hdfs_targets.path, True)

        # If we are looking at documents, extract them:
        if self.extract_documents:
            yield AnalyseAndProcessDocuments(self.job, self.launch_id,
                                             log_paths, hdfs_targets.path,
                                             True)

        # And clean out the file from temp:
        logger.warning("Removing temporary targets cache: %s" %
                       hdfs_targets.path)
        hdfs_targets.remove()
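
The yield statements above use Luigi's dynamic-dependency mechanism: yielding a task from run() suspends this task until that dependency is complete, and the yield expression then evaluates to the dependency's output target. A minimal, self-contained sketch of the same pattern (hypothetical task names and paths, not the project's code):

import luigi

class FetchFeed(luigi.Task):
    # Hypothetical stand-in for an upstream task such as CrawlFeed.
    def output(self):
        return luigi.LocalTarget('/tmp/feed.json')

    def run(self):
        with self.output().open('w') as f:
            f.write('{"targets": []}')

class ProcessFeed(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/feed-processed.json')

    def run(self):
        # Dynamic dependency: the scheduler runs FetchFeed first, then resumes
        # this run() with FetchFeed's output target as the value of the yield.
        feed = yield FetchFeed()
        with feed.open('r') as in_f, self.output().open('w') as out_f:
            out_f.write(in_f.read())

if __name__ == '__main__':
    luigi.build([ProcessFeed()], local_scheduler=True)
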
Example #2
    def _get_json(self, url):
        """Fetch the given URL and return the parsed JSON body, or None on failure."""
        js = None
        try:
            logger.info("Getting URL: %s" % url)
            r = requests.get(url, headers=self.get_headers)
            if r.status_code == 200:
                js = json.loads(r.content)
            else:
                logger.info(r.status_code)
                logger.info(r.text)
        except Exception:
            logger.warning(str(sys.exc_info()[0]))
            logger.warning(str(traceback.format_exc()))
        return js
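
For comparison, a more defensive variant of the same helper (a sketch, not the project's code): it sets a request timeout, uses Response.json(), and catches only request and JSON-decoding errors instead of everything:

import logging

import requests

logger = logging.getLogger(__name__)

def get_json(url, headers=None, timeout=30):
    # Fetch a URL and return the parsed JSON body, or None on any failure.
    try:
        logger.info("Getting URL: %s", url)
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except requests.RequestException as e:
        logger.warning("Request for %s failed: %s", url, e)
    except ValueError as e:
        logger.warning("Could not parse JSON from %s: %s", url, e)
    return None
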
Example #3
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL, a source seed and an array of publisher strings, determine
        which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
            return int(title_matches[0]['id'])
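
The seed matching above relies on url_to_surt(), which is not shown here. As an illustration only (an assumption about its behaviour, not the project's implementation), a SURT-style host prefix can be built with the standard library like this:

from urllib.parse import urlparse

def url_to_surt_sketch(url, host_only=False):
    # Illustrative only: reverse the host into SURT form, so that
    # 'http://www.example.com/news' becomes 'com,example)/news'.
    parsed = urlparse(url)
    host = (parsed.hostname or '').lower()
    if host.startswith('www.'):
        host = host[4:]
    surt_host = ','.join(reversed(host.split('.'))) + ')'
    return surt_host if host_only else surt_host + (parsed.path or '/')

# Host-only prefix matching, as used in find_watched_target_for():
seed = url_to_surt_sketch('http://example.com/', host_only=True)
candidate = url_to_surt_sketch('http://www.example.com/news/article', host_only=True)
print(candidate.startswith(seed))  # True: same registered host
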
Example #4
    def run(self):
        # Load the targets:
        with self.input().open() as f:
            all_targets = json.load(f)

        # Grab detailed target data:
        logger.info("Filtering detailed information for %i targets..." %
                    len(all_targets))

        # Filter...
        targets = []
        for t in all_targets:
            if t['crawl_frequency'] is None:
                logger.warning("No crawl frequency set for %s" % t)
            elif t['crawl_frequency'].lower() == self.frequency.lower():
                targets.append(t)

        # Persist to disk:
        with self.output().open('w') as f:
            f.write(json.dumps(targets, indent=4))
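
A self-contained sketch (hypothetical task names, paths and data, not the project's code) of the Luigi wiring a run() method like this typically sits in: requires() supplies the full target list, output() names the filtered file, and the frequency is a task parameter:

import json
import luigi

class AllTargetsStub(luigi.Task):
    # Stand-in for the real upstream task that exports every target as JSON.
    def output(self):
        return luigi.LocalTarget('/tmp/all-targets.json')

    def run(self):
        with self.output().open('w') as f:
            json.dump([{'id': 1, 'crawl_frequency': 'DAILY'},
                       {'id': 2, 'crawl_frequency': None}], f)

class FilterTargetsByFrequency(luigi.Task):
    frequency = luigi.Parameter(default='daily')

    def requires(self):
        return AllTargetsStub()

    def output(self):
        return luigi.LocalTarget('/tmp/targets-%s.json' % self.frequency)

    def run(self):
        with self.input().open() as f:
            all_targets = json.load(f)
        # Keep only targets whose crawl_frequency matches, case-insensitively:
        targets = [t for t in all_targets
                   if t.get('crawl_frequency')
                   and t['crawl_frequency'].lower() == self.frequency.lower()]
        with self.output().open('w') as f:
            f.write(json.dumps(targets, indent=4))

if __name__ == '__main__':
    luigi.build([FilterTargetsByFrequency()], local_scheduler=True)
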
Example #5
    def parse_crawl_log(self, logs):
        """
        Parses the crawl log to check the WARCs are present.
        :return:
        """
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        warcfiles = set()
        remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
        with remote_log.open('r') as f:
            for line in f:
                parts = re.split(" +", line, maxsplit=11)
                # Skip failed downloads:
                if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                    if parts[1] == '':
                        logger.info(
                            "Skipping line with empty status! '%s' from log file '%s'"
                            % (line, logs[0]))
                    continue
                # Skip locally-resolved DNS records
                if parts[1] == "1001":
                    logger.debug(
                        "Skipping finding WARC for locally-defined hostname: %s"
                        % parts[3])
                    continue
                # Attempt to parse JSON
                try:
                    (annotations, line_json) = re.split("{",
                                                        parts[11],
                                                        maxsplit=1)
                    line_json = "{%s" % line_json
                    # logger.debug("LOG JSON: %s" % line_json)
                    # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                    jmd = json.loads(line_json)
                except Exception as e:
                    logger.info("LOG LINE: %s" % line)
                    logger.info("LOG LINE part[11]: %s" % parts[11])
                    logger.exception(e)
                    raise e
                if 'warcFilename' in jmd:
                    warcfiles.add(jmd['warcFilename'])
                elif 'warcPrefix' in jmd:
                    for wren in remote_ls(LOCAL_WREN_FOLDER,
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        if wren.endswith('.open'):
                            wren = wren[:-5]
                        warcfiles.add(os.path.basename(wren))
                    # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                    for wren in remote_ls(self.warc_file_path(),
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        warcfiles.add(os.path.basename(wren))
                    # FIXME Also look on HDFS for matching files?
                else:
                    logger.warning("No WARC file entry found for line: %s" %
                                   line)

        warcs = []
        viral = []
        for warcfile in warcfiles:
            if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile),
                                 rf):
                logger.info("Found Viral WARC %s/%s" %
                            (self.viral_file_path(), warcfile))
                viral.append("%s/%s" % (self.viral_file_path(), warcfile))
            elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile),
                                   rf):
                logger.info("Found WREN WARC %s" % warcfile)
                warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
            elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile),
                                   rf):
                logger.info("Found WARC %s/%s" %
                            (self.warc_file_path(), warcfile))
                warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
            else:
                raise Exception("Cannot file warc file %s" % warcfile)

        return warcs, viral
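
The splitting logic above assumes Heritrix-style crawl.log lines: eleven whitespace-separated fields followed by an annotations field that may carry a JSON "extra info" blob. A small illustration of that split (the log line below is fabricated for demonstration only):

import json
import re

line = ('2015-04-20T12:00:00.000Z 200 1024 http://example.com/ L http://example.com/ '
        'text/html #042 20150420120000000+123 sha1:EXAMPLEDIGEST - duplicate:digest '
        '{"warcFilename":"EXAMPLE-20150420-00000.warc.gz","warcOffset":1234}')

parts = re.split(" +", line, maxsplit=11)
status_code, url = parts[1], parts[3]
# Everything after the eleventh field is annotations plus the optional JSON:
annotations, line_json = re.split("{", parts[11], maxsplit=1)
jmd = json.loads("{" + line_json)
print(status_code, url, jmd.get('warcFilename'))
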