def __init__(self, working_path, connection=None, exchange=None, raise_exception_on_count=(0,), shutdown_on_count=0,
             tries=1):
    BaseHarvester.__init__(self, working_path, stream_restart_interval_secs=5, warc_rollover_secs=120, tries=tries,
                           host="localhost")
    if connection:
        self.mq_config = True
        self._producer_connection = connection
        self.exchange = exchange
    self.harvest_seed_call_count = 0
    self.process_warc_call_count = 0
    # Throw an exception on a particular loop of harvest_seeds
    self.raise_exception_on_count = raise_exception_on_count
    # Trigger shutdown on a particular loop of harvest_seeds
    self.shutdown_on_count = shutdown_on_count
    # A nonzero shutdown count means this harvester behaves as a streaming harvester.
    if self.shutdown_on_count > 0:
        self.is_streaming = True
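# A minimal sketch (not from the source) of how the counters above are typically
# consumed: a harvest_seeds override that raises or requests shutdown on the
# configured iteration. The stop_harvest_seeds_event name is an assumption about
# BaseHarvester's stop mechanism.
def harvest_seeds(self):
    self.harvest_seed_call_count += 1
    # With the default raise_exception_on_count=(0,), this never fires, since
    # the count starts at 1.
    if self.harvest_seed_call_count in self.raise_exception_on_count:
        raise Exception("Raised on count {}".format(self.harvest_seed_call_count))
    if self.harvest_seed_call_count == self.shutdown_on_count:
        self.stop_harvest_seeds_event.set()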
def _finish_processing(self):
    BaseHarvester._finish_processing(self)
    # Move job from job_dir to job_state_dir. shutil.move would nest job_dir inside
    # an existing destination directory, so clear out any previous job state first.
    if os.path.exists(self.job_state_dir):
        log.debug("Deleting job state directory")
        shutil.rmtree(self.job_state_dir)
    if not os.path.exists(self.collection_path):
        log.debug("Creating collection path")
        os.makedirs(self.collection_path)
    log.debug("Moving job from %s to %s", self.job_dir, self.job_state_dir)
    shutil.move(self.job_dir, self.job_state_dir)
def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
             connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                           stream_restart_interval_secs=stream_restart_interval_secs, debug=debug,
                           debug_warcprox=debug_warcprox, tries=tries)
    self.twarc = None
    self.connection_errors = connection_errors
    self.http_errors = http_errors
    self.extract_media = False
    self.extract_web_resources = False
    self.extract_user_profile_images = False
def __init__(self, working_path, mq_config=None, debug=False, debug_warcprox=False, tries=3):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, debug_warcprox=debug_warcprox,
                           tries=tries)
    self.weiboarc = None
    self.incremental = False
def test_list_warcs(self):
    harvester = BaseHarvester(self.working_path, host="localhost")
    write_fake_warc(self.working_path, "test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz")
    write_fake_warc(self.working_path, "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc")
    write_fake_warc(self.working_path, "test_1-20151109195229879-00002-97528-GLSS-F0G5RP-8000")
    os.mkdir(os.path.join(self.working_path, "test_1-20151109195229879-00003-97528-GLSS-F0G5RP-8000.warc.gz"))
    warc_dirs = harvester._list_warcs(self.working_path)
    # Only files ending in .warc or .warc.gz should be listed; files without a
    # warc extension and directories are skipped.
    self.assertSetEqual(
        {"test_1-20151109195229879-00000-97528-GLSS-F0G5RP-8000.warc.gz",
         "test_1-20151109195229879-00001-97528-GLSS-F0G5RP-8000.warc"},
        set(warc_dirs))
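# Hypothetical sketch of the write_fake_warc helper the test above depends on;
# the real helper is defined elsewhere in the test module. Assumed behavior:
# write a small placeholder file so _list_warcs has entries to enumerate.
def write_fake_warc(path, filename):
    with open(os.path.join(path, filename), "w") as f:
        f.write("fake warc contents")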
def __init__(self, working_path, mq_config=None, debug=False, per_page=None, debug_warcprox=False, tries=3):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, debug_warcprox=debug_warcprox,
                           tries=tries)
    self.api = None
    # For testing purposes
    self.per_page = per_page
def __init__(self, heritrix_url, heritrix_username, heritrix_password, contact_url, working_path, mq_config=None,
             debug=False):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, use_warcprox=False, tries=1)
    # Read crawl configuration file
    with open("crawler-beans.cxml", "r") as f:
        heritrix_config = f.read()
    # Replace contact url and data path
    self.heritrix_config = heritrix_config.replace("HERITRIX_CONTACT_URL", contact_url).replace(
        "HERITRIX_DATA_PATH", working_path)
    log.debug("Heritrix config is: %s", self.heritrix_config)
    self.client = Hapy(heritrix_url, username=heritrix_username, password=heritrix_password)
    # Create a dummy job to allow looking up the directory that Heritrix is writing to.
    # This directory is in the Heritrix container's working path.
    self.client.create_job(JOB_NAME)
    self.jobs_dir = self._get_jobs_dir(JOB_NAME)
    log.debug("Jobs dir is %s", self.jobs_dir)
    self.job_dir = None
    self.job_state_dir = None
    self.job_name = None
    self.collection_path = None
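# Usage sketch with placeholder values (the enclosing class name and all
# argument values are assumptions, not from the source): constructing the
# Heritrix harvester against a local Heritrix engine.
harvester = HeritrixHarvester(
    heritrix_url="https://localhost:8443/engine",
    heritrix_username="admin",
    heritrix_password="password",
    contact_url="http://example.com/contact",
    working_path="/sfm-data/harvester")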
def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
             connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config, use_warcprox=True,
                           stream_restart_interval_secs=stream_restart_interval_secs, debug=debug,
                           debug_warcprox=debug_warcprox, tries=tries)
    self.connection_errors = connection_errors
    self.http_errors = http_errors
    # How far back facebook_scraper should look: the number of pages it will
    # scrape. This could later be made configurable.
    self.pages = 10000
    self.harvest_media_types = {'photo': True}
def __init__(self, working_path, mq_config=None, debug=False, debug_warcprox=False, tries=3):
    BaseHarvester.__init__(self, working_path, mq_config=mq_config, debug=debug, debug_warcprox=debug_warcprox,
                           tries=tries)
    self.tumblrapi = None
    self.incremental = False
def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
    BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs, debug=debug)
    self.weibowarc = None