Example #1
 def __init__(self,
              working_path,
              connection=None,
              exchange=None,
              raise_exception_on_count=(0, ),
              shutdown_on_count=0,
              tries=1):
     BaseHarvester.__init__(self,
                            working_path,
                            stream_restart_interval_secs=5,
                            warc_rollover_secs=120,
                            tries=tries,
                            host="localhost")
     if connection:
         self.mq_config = True
         self._producer_connection = connection
     self.exchange = exchange
     self.harvest_seed_call_count = 0
     self.process_warc_call_count = 0
     # Throw an exception on a particular loop of harvest_seeds
     self.raise_exception_on_count = raise_exception_on_count
     # Trigger shutdown on a particular loop of harvest_seeds
     self.shutdown_on_count = shutdown_on_count
     # A shutdown count greater than zero means the harvester is treated as streaming.
     if self.shutdown_on_count > 0:
         self.is_streaming = True
Example #2
 def _finish_processing(self):
     BaseHarvester._finish_processing(self)
     # Move job from job_dir to job_state_dir
     if os.path.exists(self.job_state_dir):
         log.debug("Deleting job state directory")
         shutil.rmtree(self.job_state_dir)
     if not os.path.exists(self.collection_path):
         log.debug("Creating collection path")
         os.makedirs(self.collection_path)
     log.debug("Moving job from %s to %s", self.job_dir, self.job_state_dir)
     shutil.move(self.job_dir, self.job_state_dir)
Example #3
 def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
              connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
     BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                            stream_restart_interval_secs=stream_restart_interval_secs,
                            debug=debug, debug_warcprox=debug_warcprox, tries=tries)
     self.twarc = None
     self.connection_errors = connection_errors
     self.http_errors = http_errors
     self.extract_media = False
     self.extract_web_resources = False
     self.extract_user_profile_images = False
Example #4
 def __init__(self,
              working_path,
              mq_config=None,
              debug=False,
              debug_warcprox=False,
              tries=3):
     BaseHarvester.__init__(self,
                            working_path,
                            mq_config=mq_config,
                            debug=debug,
                            debug_warcprox=debug_warcprox,
                            tries=tries)
     self.weiboarc = None
     self.incremental = False
Example #5
 def test_list_warcs(self):
     harvester = BaseHarvester(self.working_path, host="localhost")
     write_fake_warc(self.working_path, "test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz")
     write_fake_warc(self.working_path, "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc")
     write_fake_warc(self.working_path, "test_1-20151109195229879-00002-97528-GLSS-F0G5RP-8000")
     os.mkdir(os.path.join(self.working_path, "test_1-20151109195229879-00003-97528-GLSS-F0G5RP-8000.warc.gz"))
     warc_dirs = harvester._list_warcs(self.working_path)
     self.assertSetEqual(
         {
             "test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz",
             "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc",
         },
         set(warc_dirs),
     )
Example #6
 def __init__(self,
              working_path,
              mq_config=None,
              debug=False,
              per_page=None,
              debug_warcprox=False,
              tries=3):
     BaseHarvester.__init__(self,
                            working_path,
                            mq_config=mq_config,
                            debug=debug,
                            debug_warcprox=debug_warcprox,
                            tries=tries)
     self.api = None
     # For testing purposes
     self.per_page = per_page
Example #7
 def __init__(
     self, working_path, connection=None, exchange=None, raise_exception_on_count=(0,), shutdown_on_count=0, tries=1
 ):
     BaseHarvester.__init__(
         self, working_path, stream_restart_interval_secs=5, warc_rollover_secs=120, tries=tries, host="localhost"
     )
     if connection:
         self.mq_config = True
         self._producer_connection = connection
     self.exchange = exchange
     self.harvest_seed_call_count = 0
     self.process_warc_call_count = 0
     # Throw an exception on a particular loop of harvest_seeds
     self.raise_exception_on_count = raise_exception_on_count
     # Trigger shutdown on a particular loop of harvest_seeds
     self.shutdown_on_count = shutdown_on_count
     # A shutdown count greater than zero means the harvester is treated as streaming.
     if self.shutdown_on_count > 0:
         self.is_streaming = True
Example #8
 def __init__(self,
              working_path,
              stream_restart_interval_secs=30 * 60,
              mq_config=None,
              debug=False,
              connection_errors=5,
              http_errors=5,
              debug_warcprox=False,
              tries=3):
     BaseHarvester.__init__(
         self,
         working_path,
         mq_config=mq_config,
         stream_restart_interval_secs=stream_restart_interval_secs,
         debug=debug,
         debug_warcprox=debug_warcprox,
         tries=tries)
     self.twarc = None
     self.connection_errors = connection_errors
     self.http_errors = http_errors
Example #9
 def test_list_warcs(self):
     harvester = BaseHarvester(self.working_path, host="localhost")
     write_fake_warc(
         self.working_path,
         "test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz")
     write_fake_warc(
         self.working_path,
         "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc")
     write_fake_warc(
         self.working_path,
         "test_1-20151109195229879-00002-97528-GLSS-F0G5RP-8000")
     os.mkdir(
         os.path.join(
             self.working_path,
             "test_1-20151109195229879-00003-97528-GLSS-F0G5RP-8000.warc.gz"
         ))
     warc_dirs = harvester._list_warcs(self.working_path)
     self.assertSetEqual(
         {
             "test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz",
             "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc"
         }, set(warc_dirs))
Example #10
    def __init__(self, heritrix_url, heritrix_username, heritrix_password, contact_url,
                 working_path, mq_config=None, debug=False):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug,
                               use_warcprox=False, tries=1)
        # Read crawl configuration file
        with open("crawler-beans.cxml", 'r') as f:
            heritrix_config = f.read()
        # Replace contact url and data path
        self.heritrix_config = heritrix_config.replace("HERITRIX_CONTACT_URL", contact_url).replace(
            "HERITRIX_DATA_PATH", working_path)
        log.debug("Heritrix config is: %s", self.heritrix_config)

        self.client = Hapy(heritrix_url, username=heritrix_username, password=heritrix_password)

        # Create a dummy job to allow looking up the directory that Heritrix is writing to.
        # This directory is in the Heritrix container's working path.
        self.client.create_job(JOB_NAME)
        self.jobs_dir = self._get_jobs_dir(JOB_NAME)
        log.debug("Jobs dir is %s", self.jobs_dir)

        self.job_dir = None
        self.job_state_dir = None
        self.job_name = None
        self.collection_path = None
Example #11
    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            use_warcprox=True,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)

        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # pages attribute for facebook_scraper - how far 'back' should the scraper look?
        self.pages = 10000  # the number of pages that facebook_scraper will scrape - could later be adapted
        self.harvest_media_types = {'photo': True}
Example #12
 def __init__(self, working_path, mq_config=None, debug=False, debug_warcprox=False, tries=3):
     BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, debug_warcprox=debug_warcprox,
                            tries=tries)
     self.tumblrapi = None
     self.incremental = False
Example #13
 def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
     BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs, debug=debug)
     self.weibowarc = None
Example #14
 def __init__(self, working_path, mq_config=None, debug=False, per_page=None, debug_warcprox=False, tries=3):
     BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, debug_warcprox=debug_warcprox,
                            tries=tries)
     self.api = None
     # For testing purposes
     self.per_page = per_page
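
A common pattern runs through the examples above: each harvester subclass forwards the shared options (working_path, mq_config, debug, debug_warcprox, tries, and any streaming intervals) to BaseHarvester.__init__, then initializes its own API client handle and flags. The sketch below is a minimal illustration of that pattern, not code from this project; the import path and the ExampleHarvester name are assumptions.

from sfmutils.harvester import BaseHarvester  # assumed import path


class ExampleHarvester(BaseHarvester):
    def __init__(self, working_path, mq_config=None, debug=False, debug_warcprox=False, tries=3):
        # Forward the shared options to the base class, as the harvesters above do.
        BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug,
                               debug_warcprox=debug_warcprox, tries=tries)
        # API client handle; created later, once credentials are available.
        self.api = None
        # Whether to harvest only items newer than the previous harvest.
        self.incremental = False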