def run(self):
    # Set up necessary data so we know which targets are watched:
    feed = yield CrawlFeed('all')
    logs_count = len(self.input())
    # Cache targets in an appropriately unique filename (as unique as this task):
    hdfs_targets = yield SyncToHdfs(
        feed.path,
        '/tmp/cache/crawl-feed-%s-%s-%i.json' % (self.job, self.launch_id, logs_count),
        overwrite=True)
    # Turn the logs into a list:
    log_paths = []
    for log_file in self.input():
        log_paths.append(log_file.path)
    # Yield a task for processing all the current logs (Hadoop job):
    log_stats = yield AnalyseLogFile(self.job, self.launch_id, log_paths, hdfs_targets.path, True)
    # If we are looking at documents, extract them:
    if self.extract_documents:
        yield AnalyseAndProcessDocuments(self.job, self.launch_id, log_paths, hdfs_targets.path, True)
    # And clean out the file from temp:
    logger.warning("Removing temporary targets cache: %s" % hdfs_targets.path)
    hdfs_targets.remove()
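# Note on the bare `yield SomeTask(...)` calls above: this run() uses Luigi's dynamic
# dependency mechanism, where yielding a task from run() suspends this task until the
# yielded task completes, and the yield expression evaluates to that task's output
# target. A minimal, self-contained illustration follows; the task names and paths are
# hypothetical and not part of this module:
import luigi

class UpstreamSketch(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/upstream-sketch.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('hello')

class DownstreamSketch(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/downstream-sketch.txt')

    def run(self):
        upstream = yield UpstreamSketch()  # Dynamic dependency, resolved at run time.
        with upstream.open() as f, self.output().open('w') as out:
            out.write(f.read())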
def _get_json(self, url):
    js = None
    try:
        logger.info("Getting URL: %s" % url)
        r = requests.get(url, headers=self.get_headers)
        if r.status_code == 200:
            js = json.loads(r.content)
        else:
            logger.info(r.status_code)
            logger.info(r.text)
    except Exception:
        logger.warning(str(sys.exc_info()[0]))
        logger.warning(str(traceback.format_exc()))
    return js
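# Example usage of _get_json() (the URL is illustrative only, and self.get_headers is
# assumed to be a dict of HTTP headers set up elsewhere on this class):
#
#   targets = self._get_json("https://example.org/api/targets.json")
#   if targets is None:
#       logger.warning("Could not fetch or parse targets feed.")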
def find_watched_target_for(self, url, source, publishers):
    '''
    Given a URL and an array of publisher strings, determine which Watched
    Target to associate them with.
    '''
    # Find the list of Targets where a seed matches the given URL:
    surt = url_to_surt(url, host_only=True)
    matches = []
    for t in self.targets:
        if t['watched']:
            a_match = False
            for seed in t['seeds']:
                if surt.startswith(url_to_surt(seed, host_only=True)):
                    a_match = True
            if a_match:
                matches.append(t)
    # No matches:
    if len(matches) == 0:
        logger.error("No match found for url %s" % url)
        return None  # raise Exception("No matching target for url "+url)
    # If one match, assume that is the right Target:
    if len(matches) == 1:
        return int(matches[0]['id'])
    #
    # Else multiple matches, so need to disambiguate.
    #
    # Attempt to disambiguate based on source ONLY:
    if source is not None:
        for t in matches:
            for seed in t['seeds']:
                logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                if seed == source:
                    # return int(t['id'])
                    logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                    break
    # Then attempt to disambiguate based on publisher:
    # FIXME Make this a bit more forgiving of punctuation/minor differences.
    title_matches = []
    for t in matches:
        for publisher in publishers:
            logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
            if publisher and publisher.lower() in t['title'].lower():
                title_matches.append(t)
                break
    if len(title_matches) == 0:
        logger.warning("No matching title to associate with url %s " % url)
        return None  # raise Exception("No matching title to associate with url %s " % url)
    elif len(title_matches) == 1:
        return int(title_matches[0]['id'])
    else:
        logger.warning("Too many matching titles for %s" % url)
        for t in title_matches:
            logger.warning("Candidate: %d %s " % (t['id'], t['title']))
        logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
        return int(title_matches[0]['id'])
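# url_to_surt() is imported from elsewhere in this package. The prefix matching above
# relies on it producing a SURT-style (host-reversed) string, so that a seed URL matches
# any URL on the same host. A minimal sketch of the assumed behaviour follows; the real
# helper may differ in detail (scheme handling, trailing separators, use of the surt
# library):
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

def url_to_surt_sketch(url, host_only=False):
    parts = urlparse(url)
    host = parts.netloc.split(':')[0].lower()
    # Reverse the host labels, e.g. "www.example.co.uk" -> "uk,co,example,www,":
    surt_host = ','.join(reversed(host.split('.'))) + ','
    if host_only:
        return surt_host
    return "%s)%s" % (surt_host, parts.path or '/')

# e.g. url_to_surt_sketch("http://www.example.co.uk/news", host_only=True)
#      -> 'uk,co,example,www,'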
def run(self):
    # Load the targets:
    with self.input().open() as f:
        all_targets = json.load(f)
    # Grab detailed target data:
    logger.info("Filtering detailed information for %i targets..." % len(all_targets))
    # Filter...
    targets = []
    for t in all_targets:
        if t['crawl_frequency'] is None:
            logger.warning("No crawl frequency set for %s" % t)
        elif t['crawl_frequency'].lower() == self.frequency.lower():
            targets.append(t)
    # Persist to disk:
    with self.output().open('w') as f:
        f.write('{}'.format(json.dumps(targets, indent=4)))
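# For reference, a minimal sketch of the target-record fields this module relies on
# (crawl_frequency here, plus id/title/seeds/watched in find_watched_target_for); the
# values are illustrative only and the real feed carries many more fields:
EXAMPLE_TARGET_RECORD = {
    "id": 1,
    "title": "Example Website",
    "crawl_frequency": "DAILY",
    "seeds": ["http://www.example.co.uk/"],
    "watched": False,
}
# With self.frequency = 'daily', the case-insensitive comparison above keeps this record.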
def parse_crawl_log(self, logs):
    """
    Parses the crawl log to check the WARCs are present.
    :return:
    """
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    warcfiles = set()
    remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
    with remote_log.open('r') as f:
        for line in f:
            parts = re.split(" +", line, maxsplit=11)
            # Skip failed downloads:
            if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                if parts[1] == '':
                    logger.info("Skipping line with empty status! '%s' from log file '%s'" % (line, logs[0]))
                continue
            # Skip locally-resolved DNS records:
            if parts[1] == "1001":
                logger.debug("Skipping finding WARC for locally-defined hostname: %s" % parts[3])
                continue
            # Attempt to parse JSON:
            try:
                (annotations, line_json) = re.split("{", parts[11], maxsplit=1)
                line_json = "{%s" % line_json
                # logger.debug("LOG JSON: %s" % line_json)
                # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                jmd = json.loads(line_json)
            except Exception as e:
                logger.info("LOG LINE: %s" % line)
                logger.info("LOG LINE part[11]: %s" % parts[11])
                logger.exception(e)
                raise e
            if 'warcFilename' in jmd:
                warcfiles.add(jmd['warcFilename'])
            elif 'warcPrefix' in jmd:
                for wren in remote_ls(LOCAL_WREN_FOLDER, "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    if wren.endswith('.open'):
                        wren = wren[:-5]
                    warcfiles.add(os.path.basename(wren))
                # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                for wren in remote_ls(self.warc_file_path(), "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    warcfiles.add(os.path.basename(wren))
                # FIXME Also look on HDFS for matching files?
            else:
                logger.warning("No WARC file entry found for line: %s" % line)

    warcs = []
    viral = []
    for warcfile in warcfiles:
        if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile), rf):
            logger.info("Found Viral WARC %s/%s" % (self.viral_file_path(), warcfile))
            viral.append("%s/%s" % (self.viral_file_path(), warcfile))
        elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile), rf):
            logger.info("Found WREN WARC %s" % warcfile)
            warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
        elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile), rf):
            logger.info("Found WARC %s/%s" % (self.warc_file_path(), warcfile))
            warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
        else:
            raise Exception("Cannot find WARC file %s" % warcfile)

    return warcs, viral
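# remote_ls() and _file_exists() are helpers defined elsewhere; _file_exists()
# presumably wraps RemoteFileSystem.exists(). A minimal sketch of what remote_ls() is
# assumed to do (glob for files in a remote folder over SSH) follows; the real helper
# takes the RemoteFileSystem instance, whereas this sketch takes the hostname directly,
# and the implementation details here are an assumption:
import fnmatch
import luigi.contrib.ssh

def remote_ls_sketch(folder, pattern, host):
    # List the folder on the remote host and filter the entries against the glob pattern.
    context = luigi.contrib.ssh.RemoteContext(host)
    listing = context.check_output(["ls", "-1", folder]).decode("utf-8").splitlines()
    return ["%s/%s" % (folder, name) for name in listing if fnmatch.fnmatch(name, pattern)]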