Exemplo n.º 1
0
def main():
    """Run the frontier process for a crawl job.

    Parses the command line (a positional ``job`` name and a ``-d`` flag),
    fetches the job's config file, builds the frontier that matches the
    configured compute environment, then starts the seeder thread, the
    metrics service and the remote frontier service, and blocks until the
    frontier service finishes.

    If ``SqsMessageRetentionException`` is raised while doing so, a warning
    is logged and the whole crawl job is stopped via ``CrawlJobController``.
    """
    local_logger = LocalLogger('frontier')
    local_logger.log(LogType.Info, 'Starting')

    # __debug__ is False only when the interpreter runs with -O; in that
    # case lower the niceness by one (i.e. request a higher priority).
    if not __debug__:
        os.nice(-1)

    parser = argparse.ArgumentParser()
    # NOTE(review): '-d' is accepted but never read in this function.
    parser.add_argument('-d', action='store_true')
    parser.add_argument('job', type=str)
    args = parser.parse_args()

    config_fetcher = ConfigFetcher(args.job)
    config_file = config_fetcher.get_config_file()

    # Logger backed by the job's logs domain, tagged with a frontier id
    # derived from the configured environment.
    logger = SimpleLogger(
        get_or_create_domain(AwsConnections.sdb(),
                             CrawlJobGlossary(args.job).logs_table_name),
        create_frontier_id(config_file.global_config.environment))
    try:
        if config_file.global_config.environment == ComputeEnv.AWS:
            frontier = AwsFrontier(args.job, logger)
        else:
            frontier = LocalFrontier(args.job, logger)

        seeder = FrontierSeeder(config_file.global_config, frontier)
        # The thread passes one argument to its target; it is ignored here.
        seeder_thread = InterruptableThread(lambda t: seeder.run())
        seeder_thread.start()

        metrics_service = MetricsService(args.job, 10)
        metrics_service.start()

        frontier_service = RemoteFrontier(frontier)
        frontier_service.start()
        logger.log(LogType.Info, 'Started')
        # Block until the frontier service terminates.
        frontier_service.join()
        if frontier_service.threw_exception:
            logger.log(LogType.InternalError, 'Unexpectedly stopped', None,
                       frontier_service.exception, frontier_service.exc_info)
    except SqsMessageRetentionException as ex:
        # 'as' form is valid on Python 2.6+ and 3.x; the old comma form is
        # a SyntaxError on Python 3.
        logger.log(LogType.InternalWarning, "Full-stopping crawl job", None,
                   ex, sys.exc_info())
        CrawlJobController(args.job).stop()
Exemplo n.º 2
0
 def logs_table(self):
     """Return the logs table, resolving it on first access.

     The result of ``get_or_create_domain`` for the glossary's
     ``logs_table_name`` is cached in ``self._logs_table`` and reused on
     later calls.
     """
     table = self._logs_table
     if table is None:
         table = get_or_create_domain(
             self._sdb, self.glossary.logs_table_name)
         self._logs_table = table
     return table
Exemplo n.º 3
0
 def skipped_urls(self):
     """Return the skipped-URLs table, resolving it on first access.

     The result of ``get_or_create_domain`` for the glossary's
     ``skipped_urls_table_name`` is cached in ``self._skipped_urls`` and
     reused on later calls.
     """
     # Fast path: already resolved on a previous call.
     if self._skipped_urls is not None:
         return self._skipped_urls

     table_name = self.glossary.skipped_urls_table_name
     self._skipped_urls = get_or_create_domain(self._sdb, table_name)
     return self._skipped_urls
Exemplo n.º 4
0
 def redirected_urls(self):
     """Return the redirected-URLs table, resolving it on first access.

     The result of ``get_or_create_domain`` for the glossary's
     ``redirected_urls_table_name`` is cached in
     ``self._redirected_urls`` and reused on later calls.
     """
     # Fast path: already resolved on a previous call.
     if self._redirected_urls is not None:
         return self._redirected_urls

     table_name = self.glossary.redirected_urls_table_name
     self._redirected_urls = get_or_create_domain(self._sdb, table_name)
     return self._redirected_urls