def _create_warc_temp_dir(self):
    """
    Create temporary directory for WARC files.

    :return: the directory path
    """
    path = os.path.join(self.working_path, "tmp", safe_string(self.message["id"]))
    if not os.path.exists(path):
        os.makedirs(path)
    return path
def _create_conf_file(self, harvest_id, debug, debug_warcprox, tries):
    # Note that we give the process a long time to shut down;
    # a stream harvester may need to finish processing.
    contents = """[program:{process_group}]
command={python_executable} {script} --debug={debug} --debug-warcprox={debug_warcprox} seed {seed_filepath} {working_path} --streaming --host {mq_host} --username {mq_username} --password {mq_password} --tries {tries}
user={user}
autostart=true
autorestart=unexpected
exitcodes=0,1
stopwaitsecs=900
stderr_logfile={log_path}/{safe_harvest_id}.err.log
stdout_logfile={log_path}/{safe_harvest_id}.out.log
""".format(process_group=self._get_process_group(harvest_id),
           safe_harvest_id=safe_string(harvest_id),
           python_executable=self.python_executable,
           script=self.script,
           seed_filepath=self._get_seed_filepath(harvest_id),
           working_path=self.working_path,
           mq_host=self.mq_host,
           mq_username=self.mq_username,
           mq_password=self.mq_password,
           user=self.process_owner,
           log_path=self.log_path,
           debug=debug,
           debug_warcprox=debug_warcprox,
           tries=tries)

    # Write the file. Open in text mode: contents is a str, so "wb" would
    # raise a TypeError under Python 3.
    conf_filepath = self._get_conf_filepath(harvest_id)
    log.debug("Writing conf to %s: %s", conf_filepath, contents)
    with open(conf_filepath, "w") as f:
        f.write(contents)
    # Do a chmod +x and add group write permissions.
    filestatus = os.stat(conf_filepath)
    os.chmod(conf_filepath,
             filestatus.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH | stat.S_IWGRP)
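# For reference, the template above renders to a supervisord program entry
# along these lines. This is a sketch with assumed values -- the harvest id
# "harvest:1", the /opt/sfm paths, and the sfmuser account are hypothetical:
#
#   [program:harvest_1]
#   command=/usr/bin/python /opt/sfm/harvester.py --debug=False --debug-warcprox=False seed /opt/sfm/conf/harvest_1.json /opt/sfm/working --streaming --host mq --username sfm --password password --tries 3
#   user=sfmuser
#   autostart=true
#   autorestart=unexpected
#   exitcodes=0,1
#   stopwaitsecs=900
#   stderr_logfile=/opt/sfm/log/harvest_1.err.log
#   stdout_logfile=/opt/sfm/log/harvest_1.out.log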
@staticmethod
def _get_process_group(harvest_id):
    return safe_string(harvest_id)
def _get_seed_filepath(self, harvest_id):
    return "{}/{}.json".format(self.conf_path, safe_string(harvest_id))
def on_message(self):
    assert self.message
    log.info("Harvesting by message with id %s", self.message["id"])

    self.result_filepath = os.path.join(
        self.working_path,
        "{}_result.json".format(safe_string(self.message["id"])))

    # Create a temp directory for WARCs.
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()

    # Possibly resume a harvest.
    self.result = HarvestResult()
    self.result.started = datetime_now()
    if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This gives immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This gives immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop; for other harvesters, this is
    # tripped after the first pass through the loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal indicating that the harvester should stop.
    # This is a graceful shutdown: harvesting seeds is stopped and processing
    # is finished, which may take some time.
    def shutdown(signal_number, stack_frame):
        log.debug("Shutdown triggered")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This allows warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    log.debug("Message is %s", json.dumps(self.message, indent=4))

    # Set up the restart timer for streams. The restart timer stops and
    # restarts the stream periodically, which makes sure that each HTTP
    # response is limited in size.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(
            self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue WARC files timer.
    self.queue_warc_files_timer = threading.Timer(
        self.queue_warc_files_interval_secs, self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event.
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    with warced(safe_string(self.message["id"]), self.warc_temp_dir,
                                debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs
                                if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying.
                    log.debug("Too many retries, so giving up on harvesting seeds.")
                    done = True
                    self.result.success = False
                    self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry.
                    # Queue any WARC files.
                    self._queue_warc_files()

    # Wait for any WARC files to be processed.
    log.debug("Waiting for processing to complete.")
    self.warc_processing_queue.join()
    log.debug("Processing complete.")

    # Queue any WARC files.
    self._queue_warc_files()

    # Turn off the restart stream timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer.
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing.
    self._finish_processing()

    # Delete the temp dir.
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])
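# A minimal sketch (a hypothetical subclass, not part of this file) of how
# harvest_seeds is expected to cooperate with stop_harvest_seeds_event in a
# streaming harvester, so the shutdown handler above can interrupt it;
# stream_items and process_item are assumed names:
#
#   class ExampleStreamHarvester(BaseHarvester):
#       def harvest_seeds(self):
#           for item in self.stream_items():
#               if self.stop_harvest_seeds_event.is_set():
#                   break  # returning lets warcprox exit cleanly
#               self.process_item(item)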
def facebook_user_ads(self, username, nsid, iso2c, access_token):
    assert username or nsid
    limit_per_page = 500
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)
    if nsid and access_token and iso2c:
        # Start scraping.
        request_url = "https://graph.facebook.com/v5.0/ads_archive"
        request_params = {
            "access_token": access_token,
            "limit": limit_per_page,
            "search_page_ids": str(nsid),
            "ad_active_status": "ALL",
            "ad_reached_countries": iso2c,  # todo
            "fields": "page_name, page_id, funding_entity, ad_creation_time, "
                      "ad_delivery_start_time, ad_delivery_stop_time, ad_creative_body, "
                      "ad_creative_link_caption, ad_creative_link_description, "
                      "ad_creative_link_title, ad_snapshot_url, demographic_distribution, "
                      "region_distribution, impressions, spend, currency"
        }
        api_result = requests.get(request_url, params=request_params)
        log.debug("Ad library response: %s", api_result.text)

        random_token = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = safe_string(self.message["id"]) + "-" + warcprox.timestamp17() + \
                    "-" + serial_no + "-" + random_token

        # Write to WARC.
        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"), "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s", self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps."""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(api_result.json(),
                                      default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            # Fall back to nsid for the record URI, since username may be
            # None when only the page id was supplied.
            record = writer.create_warc_record(
                "https://m.facebook.com/" + (username or str(nsid)), 'metadata',
                payload=BytesIO(json_payload),
                warc_content_type="application/json")
            writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)
        time.sleep(1.2)  # sleep to avoid getting blocked by the API
    else:
        log.debug("Something went wrong. Is some information missing? "
                  "Access token is: %s, iso2c is: %s", str(access_token), str(iso2c))
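# The request above fetches only the first page of results (up to
# limit_per_page ads). A minimal sketch of following the Graph API's
# "paging.next" cursor to retrieve subsequent pages -- this helper is
# hypothetical and is not called by facebook_user_ads:
def _iter_ad_archive_pages(request_url, request_params):
    """Yield each page of ads_archive JSON, following paging cursors."""
    response = requests.get(request_url, params=request_params)
    while True:
        payload = response.json()
        yield payload
        # The Graph API includes a fully parameterized "next" URL when
        # more results exist.
        next_url = payload.get("paging", {}).get("next")
        if not next_url:
            break
        time.sleep(1.2)  # stay polite, mirroring the sleep above
        response = requests.get(next_url)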
def facebook_user_bio(self, username):
    """Scrapes the Facebook bio and returns the information contained on the
    about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal).

    @param username: Facebook username
    @return: a dictionary of account attributes
    """
    user_email_fb = self.message['credentials']['user_email_fb']
    user_password_fb = self.message['credentials']['user_password_fb']

    # Ensure username is clean and can be accessed.
    if username.startswith("https://www.facebook.com/") or \
            username.startswith("http://www.facebook.com/"):
        username = re.sub(r'^.+facebook\.com\/', '', username)
        # Possibly also remove a trailing slash.
        username = re.sub(r'\/$', '', username)

    # Created-at field.
    fb_general = base_fb_url + username
    # Bio info.
    fb_about = base_fb_url + username + "/about/?ref=page_internal"
    # Site transparency (e.g. admins).
    m_fb_general = "http://m.facebook.com/" + username

    # Request the HTML.
    r = requests.get(fb_general)
    # Ensure no 404s.
    if not r:
        log.debug("Couldn't access profile site: %s", fb_general)
        return

    soup = BeautifulSoup(r.content, "html.parser")
    # Scrape the creation date.
    created_at = soup.find('div', {"class": "_3qn7"})
    created_at = created_at.select_one("span").text
    created_at = re.sub(r"(Seite erstellt)", "", created_at)
    created_at = created_at[3:]
    # Scrape the number of likes: find the span with the like count,
    # then isolate the likes via regex.
    spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
    likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()

    bio_dict = {
        "username": fb_general,
        "n_likes": likes,
        "created_at": created_at
    }

    # Request the about page HTML.
    r_about = requests.get(fb_about)
    # Ensure no 404s.
    if not r_about:
        log.debug("Couldn't access username/about site: %s", fb_about)
        return

    about_soup = BeautifulSoup(r_about.content, "html.parser")
    mission_text = about_soup.find_all('div', {'class': "_4bl9"})
    for divs in mission_text:
        describing_div = divs.find('div', {'class': '_50f4'})
        content_div = divs.find('div', {'class': '_3-8w'})
        if describing_div and content_div:
            bio_dict[describing_div.text] = content_div.text

    # Photos: retrieve the profile and cover photo of the public Facebook page
    # by going to the 'about' page, parsing the HTML, and getting the links to
    # photos from the script tags; these can then be passed to _harvest_media_url.
    # This is not affected by the harvest_media option but will always happen.
    all_scripts = about_soup.find_all('script')
    for js in all_scripts:
        for content in js.contents:
            if 'cover_photo' in content:
                # Isolate relevant links.
                links = re.findall(r'https\:\\/\\/scontent[^"]*', content)
                # Remove escaped forward slashes.
                for val, link in enumerate(links):
                    links[val] = re.sub(r'\\', "", link)
                    self._harvest_media_url(links[val])

    if m_fb_general:
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"
        site_transparency_detail_id = "u_0_d"

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('headless')
        chrome_options.add_argument('start-maximized')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--window-size=1200x800')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument(f"user-agent={user_agent}")

        # This connects to the Selenium container to start scraping.
        # Pass chrome_options so the arguments above actually take effect.
        driver = webdriver.Remote("host.docker.internal:4444/wd/hub",
                                  {'browserName': 'chrome'},
                                  options=chrome_options)
        driver.get("http://m.facebook.com")
        driver.maximize_window()

        # Accept cookies, with a more or less random wait to replicate user
        # behavior and ensure politeness.
        cookies = driver.find_element_by_id('accept-cookie-banner-label')
        time.sleep(random.uniform(3, 9))
        cookies.click()

        # Find the email (or phone) field, the password field, and the submit button.
        username_fb = driver.find_element_by_id("m_login_email")
        password_fb = driver.find_element_by_id("m_login_password")
        submit = driver.find_element_by_css_selector("._56b_")

        # Send keys, making sure the fields are not prepopulated.
        # 2FA has to be deactivated.
        username_fb.clear()
        password_fb.clear()
        username_fb.send_keys(user_email_fb)
        password_fb.send_keys(user_password_fb)
        time.sleep(random.uniform(3, 9))

        # Click login.
        submit.click()
        time.sleep(random.uniform(3, 9))

        # Navigate to the site.
        driver.get(m_fb_general)
        time.sleep(random.uniform(3, 9))
        driver.execute_script("window.scrollTo(0, 800)")  # site info only loads on scroll
        # Use class name and div content (todo).
        time.sleep(random.uniform(20, 25))
        element = WebDriverWait(driver, 20).until(
            ec.presence_of_element_located(
                (By.CSS_SELECTOR, site_transparency_class_selector)))
        site_transparency = driver.find_elements_by_css_selector(
            site_transparency_class_selector)
        # Site transparency should always be below "about".
        site_transparency[1].click()
        time.sleep(random.uniform(15, 20))

        # Simply get the whole text of the site's transparency box;
        # the exact info can be extracted ex post.
        element = WebDriverWait(driver, 20).until(
            ec.presence_of_element_located((By.ID, site_transparency_detail_id)))
        time.sleep(random.uniform(3, 9))
        site_transparency_text = driver.find_element_by_id(
            site_transparency_detail_id).text
        time.sleep(random.uniform(3, 9))
        driver.close()
        log.info("Finished scraping transparency box")
        bio_dict['transparency_text'] = site_transparency_text

    # Ensure that a WARC is only written if the sites were found;
    # otherwise nothing happens.
    if r_about or r:
        # Filename will later be converted to a path, replicating the pattern from
        # https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # Create a random token for the filename.
        random_token = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = safe_string(self.message["id"]) + "-" + warcprox.timestamp17() + \
                    "-" + serial_no + "-" + random_token

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"), "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s", self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps."""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(bio_dict, default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record("https://m.facebook.com/" + username, 'metadata',
                                               payload=BytesIO(json_payload),
                                               warc_content_type="application/json")
            writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)
def facebook_user_timeline(self, seed_id, username, nsid):
    """Scrapes the user timeline."""
    log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
    # Make sure either username or nsid is present to start scraping.
    assert username or nsid
    # Possibly look up the username.
    if username and not nsid:
        log.debug("No FB userid, retrieving it")
        nsid = self.get_fbid(username)

    if nsid:
        # Report back that the user id was found.
        log.info("FB userid %s", nsid)
        # todo - need to add timeout and what to do if blocked
        # todo - post ids will sometimes be empty, account for that for incremental
        incremental = self.message.get("options", {}).get("incremental", False)
        harvest_media = self.message.get("options", {}).get("harvest_media", False)

        if incremental:
            # Search for the since_id of the last harvested post.
            since_id = self.state_store.get_state(
                __name__, u"timeline.{}.since_id".format(nsid))

        scrape_result = []
        for post in facebook_scraper.get_posts(nsid, pages=self.pages,
                                               extra_info=True, timeout=20):
            scrape_result.append(post)
            self.result.harvest_counter["posts"] += 1
            self.result.increment_stats("posts")

            # The second condition avoids parsing empty lists (i.e. no media).
            if harvest_media and post['images']:
                log.info("Harvesting media from post")
                # Get media content from links; it should automatically be caught
                # within the WARC stream. All photos on FB are jpgs, so the list
                # comprehension checks whether this is the case for the stream;
                # if not (e.g. video), it will not harvest.
                [self._harvest_media_url(media_url)
                 for media_url in post['images'] if 'jpg' in media_url]

            if incremental and post["post_id"] == since_id:
                log.info("Stopping, found last post that was previously harvested with id: %s",
                         post["post_id"])
                break

        # Filename will later be converted to a path, replicating the pattern from
        # https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
        # Create a random token for the filename.
        random_token = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
        serial_no = '00000'
        file_name = safe_string(self.message["id"]) + "-" + warcprox.timestamp17() + \
                    "-" + serial_no + "-" + random_token

        with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"), "wb") as result_warc_file:
            log.info("Writing json-timeline result to path %s", self.warc_temp_dir)
            writer = WARCWriter(result_warc_file, gzip=True)

            def json_date_converter(o):
                """Converts datetime.datetime items in the facebook_scraper
                result to a format suitable for json.dumps."""
                if isinstance(o, datetime.datetime):
                    return o.__str__()

            json_payload = json.dumps(scrape_result, default=json_date_converter,
                                      ensure_ascii=False).encode("utf-8")

            record = writer.create_warc_record(username, 'metadata',
                                               payload=BytesIO(json_payload),
                                               warc_content_type="application/json")
            writer.write_record(record)
        log.info("Writing scraped results to %s", self.warc_temp_dir)

        # Write to the state store.
        key = "timeline.{}.since_id".format(nsid)
        max_post_time = scrape_result[0].get("time")
        max_post_id = scrape_result[0].get("post_id")
        assert max_post_time and max_post_id

        if incremental:
            self.state_store.set_state(__name__, key, max_post_id)
            log.info("Wrote first scraped post to state_store")
    else:
        msg = "NSID not found for user {}".format(username)
        log.exception(msg)
        self.result.warnings.append(Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
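# facebook_user_ads, facebook_user_bio, and facebook_user_timeline all repeat
# the same write-JSON-to-WARC block. A minimal refactoring sketch of a shared
# helper (hypothetical, not present in the original) that the three methods
# could call instead:
def _write_json_warc_record(self, target_uri, payload_obj):
    """Serialize payload_obj to JSON and write it to a new gzipped WARC file
    in self.warc_temp_dir as a single metadata record, using the
    warcprox-style filename convention."""
    random_token = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
    file_name = "{}-{}-00000-{}".format(safe_string(self.message["id"]),
                                        warcprox.timestamp17(), random_token)
    # default=str renders datetimes the same way as the json_date_converter
    # helpers above.
    json_payload = json.dumps(payload_obj, default=str,
                              ensure_ascii=False).encode("utf-8")
    with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"), "wb") as warc_file:
        writer = WARCWriter(warc_file, gzip=True)
        record = writer.create_warc_record(target_uri, 'metadata',
                                           payload=BytesIO(json_payload),
                                           warc_content_type="application/json")
        writer.write_record(record)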
def on_message(self):
    assert self.message
    log.info("Harvesting by message with id %s", self.message["id"])

    self.result_filepath = os.path.join(
        self.working_path,
        "{}_result.json".format(safe_string(self.message["id"])))

    # Create a temp directory for WARCs.
    self.warc_temp_dir = self._create_warc_temp_dir()
    self._create_state_store()

    # Possibly resume a harvest.
    self.result = HarvestResult()
    self.result.started = datetime_now()
    if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
        self._load_result()
        self.result.warnings.append(
            Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
        # Send a status message. This gives immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)
        self._queue_warc_files()
    else:
        # Send a status message. This gives immediate indication that harvesting is occurring.
        self._send_status_message(STATUS_RUNNING)

    # stop_harvest_loop_event tells the harvester to stop looping.
    # Only streaming harvesters loop; for other harvesters, this is
    # tripped after the first pass through the loop.
    self.stop_harvest_loop_event = threading.Event()

    # Supervisor sends a signal indicating that the harvester should stop.
    # This is a graceful shutdown: harvesting seeds is stopped and processing
    # is finished, which may take some time.
    def shutdown(signal_number, stack_frame):
        log.info("Shutdown triggered")
        # This is for the consumer.
        self.should_stop = True
        if self.is_pause:
            log.info("This will be a pause of the harvest.")
        self.stop_harvest_loop_event.set()
        # stop_harvest_seeds_event tells the harvester to stop harvest_seeds.
        # This allows warcprox to exit.
        self.stop_harvest_seeds_event.set()
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    def pause(signal_number, stack_frame):
        self.is_pause = True

    signal.signal(signal.SIGUSR1, pause)

    log.debug("Message is %s", json.dumps(self.message, indent=4))

    # Set up the restart timer for streams. The restart timer stops and
    # restarts the stream periodically, which makes sure that each HTTP
    # response is limited in size.
    if self.is_streaming:
        self.restart_stream_timer = threading.Timer(
            self.stream_restart_interval_secs, self._restart_stream)
        self.restart_stream_timer.start()

    # Start a queue WARC files timer.
    self.queue_warc_files_timer = threading.Timer(
        self.queue_warc_files_interval_secs, self._queue_warc_files)
    self.queue_warc_files_timer.start()

    while not self.stop_harvest_loop_event.is_set():
        # Reset the stop_harvest_seeds_event.
        self.stop_harvest_seeds_event = threading.Event()

        # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
        if not self.is_streaming:
            self.stop_harvest_loop_event.set()

        # Here is where the harvesting happens.
        try_count = 0
        done = False
        while not done:
            try_count += 1
            log.debug("Try {} of {}".format(try_count, self.tries))
            try:
                if self.use_warcprox:
                    with warced(safe_string(self.message["id"]), self.warc_temp_dir,
                                debug=self.debug_warcprox,
                                interrupt=self.is_streaming,
                                rollover_time=self.warc_rollover_secs
                                if not self.is_streaming else None):
                        self.harvest_seeds()
                else:
                    self.harvest_seeds()
                done = True
                log.debug("Done harvesting seeds.")
            except Exception as e:
                log.exception("Unknown error raised during harvest: %s", e)
                if try_count == self.tries:
                    # Give up trying.
                    log.debug("Too many retries, so giving up on harvesting seeds.")
                    done = True
                    self.result.success = False
                    self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                    self.stop_harvest_loop_event.set()
                else:
                    # Retry.
                    # Queue any WARC files.
                    self._queue_warc_files()

    # Wait for any WARC files to be processed.
    log.debug("Waiting for processing to complete.")
    self.warc_processing_queue.join()
    log.debug("Processing complete.")

    # Queue any WARC files.
    self._queue_warc_files()

    # Turn off the restart stream timer.
    if self.restart_stream_timer:
        self.restart_stream_timer.cancel()

    # Turn off the queue WARC files timer.
    if self.queue_warc_files_timer:
        self.queue_warc_files_timer.cancel()

    # Finish processing.
    self._finish_processing()

    # Delete the temp dir.
    if os.path.exists(self.warc_temp_dir):
        shutil.rmtree(self.warc_temp_dir)

    log.info("Done harvesting by message with id %s", self.message["id"])
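# For reference, how an operator process might distinguish a pause from a
# stop given the handlers above (a sketch; how the harvester pid is
# discovered is assumed):
#
#   os.kill(harvester_pid, signal.SIGUSR1)  # mark the next stop as a pause
#   os.kill(harvester_pid, signal.SIGTERM)  # then trigger the graceful shutdown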
def test_safe_string(self):
    self.assertEqual("fooBAR12", safe_string("fooBAR12"))
    self.assertEqual("foo-bar-12", safe_string("foo.bar 12", replace_char="-"))
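# For reference, a minimal implementation consistent with the assertions
# above. The real safe_string lives elsewhere in this package; the default
# replace_char value here is an assumption:
def safe_string(value, replace_char="_"):
    """Replace every character that is not a letter or digit with
    replace_char, making the result safe for filenames and supervisor
    process-group names."""
    return "".join(c if c.isalnum() else replace_char for c in value)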