import datetime

# SqlManager, FourChanThread, and Package are assumed to be imported from the
# project's own modules; their import paths are not shown in this excerpt.


def main():
    # "get_object_fun" tells SqlManager which attribute of the package holds
    # the objects to persist.
    sql_committer = SqlManager({
        "logger": None,
        "get_object_fun": lambda pkg: pkg.threads
    })

    thread_obj = FourChanThread(input_dict={
        "is_pinned": False,
        "op_content": "OP Content",
        "post_datetime": datetime.datetime.now(),
        "image_content": "",
        "abbreviated": False,
        "body_cut_off": False,
        "post_num": "1020471246",
        "link": "URL"
    })

    pkg = Package()
    pkg.threads = [thread_obj]
    sql_committer.scrape_from_url("", pkg)


if __name__ == "__main__":
    main()
    def put_data_in_pipeline(self, data_point):
        first_limb_name = self.first_limb.__name__
        self.timing_manager.record_incoming_job()

        # Wrap the incoming data point in a fresh Package and a delivery
        # envelope addressed to the pipeline's first limb.
        new_package = Package()
        delivery = {
            "package_data": new_package,
            "data_point": data_point,
            "limb_name": first_limb_name,
            "type": "job"
        }

        # Whether the first limb is running slowly; the result is not yet
        # used by the dispatch logic below.
        first_limb_is_slow = self.timing_manager.is_limb_slow(None, first_limb_name)

        # If the first limb's queue is backing up, spin up another worker
        # process for it and reset its timing statistics.
        if len(self.limb_to_queue[first_limb_name]) > 3:
            self.create_process(first_limb_name)
            self.timing_manager.reset_timing_info(self.first_limb)

        # Scan for an idle worker, claiming it while its lock is still held
        # so that two producers can never grab the same process between the
        # check and the update.
        free_process = None
        for process_id in self.limb_to_process_ids[first_limb_name]:
            with self.process_id_busy_lock[process_id]:
                if not self.process_id_is_busy[process_id]:
                    self.process_id_is_busy[process_id] = True
                    free_process = process_id
                    break

        if free_process is not None:
            delivery["process_id"] = free_process
            self.timing_manager.record_process_input(free_process)
            self.timing_manager.record_limb_input(first_limb_name)
            self.socket_handler.send_job(free_process, delivery)
        else:
            # No idle worker: queue the delivery for the first limb.
            with self.limb_to_queue_lock[first_limb_name]:
                self.limb_to_queue[first_limb_name].append(delivery)
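# The claim-while-locked scan above is the heart of the dispatcher. Below is a
# minimal, self-contained sketch of the same pattern using only the standard
# library; the worker IDs and state dicts are hypothetical, not project code.

import threading

process_id_is_busy = {1: False, 2: True}
process_id_busy_lock = {pid: threading.Lock() for pid in process_id_is_busy}

def claim_free_process():
    # Mark the worker busy while its lock is held, so no other producer can
    # claim the same process between the check and the update.
    for pid, lock in process_id_busy_lock.items():
        with lock:
            if not process_id_is_busy[pid]:
                process_id_is_busy[pid] = True
                return pid
    return None  # no idle worker; the caller should queue the job instead

print(claim_free_process())  # -> 1 (claims the only idle worker)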
        print("Visit time received for " + url + ": " + str(this_visit_time))
        visit_time = this_visit_time
        cursor.close()
        return visit_time

    def scrape_from_url(self, url, package):
        self.create_table_if_not_exist()
        self.write_url_visit_time(url)

        print("Linked Resources Before: " + str(package.linked_resources))
        # Iterate over a copy: removing items from the list being iterated
        # would silently skip the element after each removal.
        for next_resource in list(package.linked_resources):
            last_visited_time = self.fetch_last_visited_time(next_resource)
            print("Last visit time for " + next_resource + ": " + str(last_visited_time))
            if last_visited_time and datetime.datetime.now() - last_visited_time < IGNORE_TIME_THRESHOLD:
                package.linked_resources.remove(next_resource)
        print("Linked Resources After: " + str(package.linked_resources))


if __name__ == "__main__":
    url = "d1b5f81e158880a30e9210172e21709c"
    config = {"logger": None}
    package = Package()
    package.linked_resources = ["d1b5f81e158880a30e9210172e21709c",
                                "7a0c13eac425663e5fd6a3c7470e6dbb"]
    ignore_recent_obj = IgnoreRecentUrls(config)
    ignore_recent_obj.scrape_from_url(url, package)
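# The list(...) copy in the loop above matters. A standalone demonstration of
# the pitfall it avoids (plain Python, unrelated to the project's classes):

items = ["a", "b", "c", "d"]
for item in items:          # BUG: removing from the list being iterated
    items.remove(item)
print(items)                # -> ['b', 'd']; every other element is skipped

items = ["a", "b", "c", "d"]
for item in list(items):    # iterating over a copy removes them all
    items.remove(item)
print(items)                # -> []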
        super(SleepLimb, self).__init__(config_dict)
        self.logger = config_dict["logger"]

        # "^" matches every string, so this limb is applied to every URL.
        wildcard_re = re.compile("^")
        self.associate_regex_with_method(wildcard_re, self.ingest)

    def ingest(self, url, data_package):
        """
        Logs the URL being processed, sleeps for five seconds, and takes no
        other action.

        :param url: the URL of the page being processed
        :param data_package: the Package() object containing the data accrued
                             from previous limbs
        :return: None
        """
        self.logger.info("Currently processing " + url +
                         " with the sleep limb. No action has been taken.")
        time.sleep(5)
        self.logger.info("\t Done.")


if __name__ == "__main__":
    config = {
        "logger": centipede_logger.create_logger("sleep_limb", logging.DEBUG)
    }
    sleep_limb = SleepLimb(config)
    pack = Package()
    sleep_limb.scrape_from_url("", pack)
            # get_send_flag_func is pulled from the config dict above this
            # excerpt. Packages without the inspected attribute mean "no text".
            send_text_flag = False
            try:
                send_text_flag = get_send_flag_func(data_package)
            except AttributeError:
                pass

            if send_text_flag:
                message_body = self.config_dict["message_template"].format(url)
                client.publish(PhoneNumber=aws_sns_constants.DEST_NUMBER,
                               Message=message_body)
                self.logger.debug("We are sending a text for url " + url)
        else:
            # Pairs with the config-presence check above this excerpt.
            raise AttributeError("The config dict for " + str(self.__class__) +
                                 " must contain an attribute 'get_text_flag'.")


if __name__ == "__main__":
    config = {
        "get_text_flag": lambda package: package.is_malicious,
        "message_template": "The thread found at {} was found to be malicious!",
        "logger": centipede_logger.create_logger("send_text", logging.DEBUG)
    }
    send_text = SendText(config)
    pack = Package()
    pack.is_malicious = True
    send_text.scrape_from_url("", pack)
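# client and aws_sns_constants.DEST_NUMBER are defined earlier in the file and
# are not shown in the excerpt. A minimal sketch of how such an SNS client is
# typically built with boto3; the region and number below are placeholders.

import boto3

client = boto3.client("sns", region_name="us-east-1")
DEST_NUMBER = "+15555550100"  # placeholder destination in E.164 format

# Publishing directly to a phone number sends an SMS.
client.publish(PhoneNumber=DEST_NUMBER, Message="Test message from the limb")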
            comment_data = RedditComment(input_dict={
                "comment_id": id,
                "content": body,
                "comment_datetime": time_string,
                "comment_author": username,
                "points": points,
                "source": page_url,
                "rank": i
            })
            data_package.reddit_info.append(comment_data)


if __name__ == "__main__":
    config_dict = {
        "logger": centipede_logger.create_logger("reddit_scraper", logging.DEBUG),
        "ff_binary_location": "C:\\Program Files\\Mozilla Firefox",
        "SPOOF_USER_AGENT": True,
        "USE_PROXY_SERVER": False
    }
    scraper = RedditScraper(config_dict)
    pkg = Package()
    scraper.scrape_from_url(
        "https://www.reddit.com/r/SelfAwarewolves/comments/ga057f/a_trans_exclusionist_has_a_brush_with_self/",
        pkg)
    print(pkg.__dict__)
            self.logger.debug(data_package.threads[i].link + " is not malicious.")
            data_package.threads[i].is_malicious = is_malicious
        else:
            # Pairs with the config-presence check above this excerpt.
            raise AttributeError("The config dict for " + str(self.__class__) +
                                 " must contain an attribute 'get_text_method'.")


if __name__ == "__main__":
    config = {
        "get_text_method": lambda package: [
            thread.op_content for thread in package.threads
            if not thread.body_cut_off
        ],
        "logger": centipede_logger.create_logger("DetectMaliceInText", logging.DEBUG)
    }
    detect_malice_limb = DetectMaliceInText(config)
    package = Package()
    package.threads = [
        FourChanThread({
            "op_content": "If you live in wichita ks dont come to school tomorrow"
        })
    ]
    detect_malice_limb.scrape_from_url("", package)
        image_contents = base64.b64encode(contents)

        # The second anchor inside the postNum span holds the thread number.
        post_num_obj = thread_object.find("span", class_="postNum")
        post_num = post_num_obj.find_all("a")[1].get_text()
        thread_permalink = thread_page + "thread/" + str(post_num)

        thread_attributes = {
            "is_pinned": pinned,
            "op_content": op_content,
            "post_datetime": op_datetime,
            "image_content": image_contents,
            "abbreviated": False,
            "body_cut_off": False,
            "post_num": post_num,
            "link": thread_permalink
        }
        thread_obj = FourChanThread(thread_attributes)

        data_package.threads = []
        data_package.threads.append(thread_obj)
        print(thread_obj)


if __name__ == "__main__":
    scraper = FourChanScraper({
        "logger": centipede_logger.create_logger("four_chan_scraper", logging.DEBUG)
    })
    data_package = Package()
    data_package.threads = []  # stays empty while the scrape below is commented out
    # scraper.scrape_from_url("http://boards.4chan.org/a/", data_package)
    for thread in data_package.threads:
        print(thread)
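# The post-number extraction above uses BeautifulSoup's find/find_all. A
# self-contained sketch of the same pattern; the markup below is made up for
# illustration, not real 4chan HTML.

from bs4 import BeautifulSoup

html = '<span class="postNum"><a href="#">No.</a><a href="#">1020471246</a></span>'
soup = BeautifulSoup(html, "html.parser")

post_num_obj = soup.find("span", class_="postNum")
post_num = post_num_obj.find_all("a")[1].get_text()  # second anchor holds the number
print(post_num)  # -> 1020471246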