Example #1
    def put_data_in_pipeline(self, data_point):

        first_limb_name = self.first_limb.__name__

        self.timing_manager.record_incoming_job()

        new_package = Package()
        delivery = {
            "package_data": new_package,
            "data_point": data_point,
            "limb_name": first_limb_name,
            "type": "job",
        }

        # Spin up another worker when the limb is running slow or its queue
        # is backing up.
        first_limb_is_slow = self.timing_manager.is_limb_slow(None, first_limb_name)
        if first_limb_is_slow or len(self.limb_to_queue[first_limb_name]) > 3:
            self.create_process(first_limb_name)
            self.timing_manager.reset_timing_info(self.first_limb)

        # Claim an idle process while still holding its lock, so two callers
        # cannot grab the same worker between the check and the busy-flag set.
        free_process = None
        for process_id in self.limb_to_process_ids[first_limb_name]:
            with self.process_id_busy_lock[process_id]:
                if not self.process_id_is_busy[process_id]:
                    self.process_id_is_busy[process_id] = True
                    free_process = process_id
                    break

        if free_process is not None:

            delivery["process_id"] = free_process
            self.timing_manager.record_process_input(free_process)
            self.timing_manager.record_limb_input(first_limb_name)
            self.socket_handler.send_job(free_process, delivery)
        else:
            # No idle worker available: queue the job for this limb.
            with self.limb_to_queue_lock[first_limb_name]:
                self.limb_to_queue[first_limb_name].append(delivery)
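
For reference, a minimal sketch of the timing-manager interface that put_data_in_pipeline relies on; the method names come from the calls above, but the bodies here are hypothetical placeholders, not the real implementation:

import time


class TimingManager:
    """Hypothetical stand-in for the timing manager used above."""

    def __init__(self):
        self.incoming_jobs = 0
        self.limb_inputs = {}
        self.process_inputs = {}

    def record_incoming_job(self):
        self.incoming_jobs += 1

    def record_limb_input(self, limb_name):
        self.limb_inputs.setdefault(limb_name, []).append(time.time())

    def record_process_input(self, process_id):
        self.process_inputs.setdefault(process_id, []).append(time.time())

    def is_limb_slow(self, limb, limb_name):
        # Placeholder heuristic: a real implementation would compare the
        # limb's arrival rate against its processing rate.
        return len(self.limb_inputs.get(limb_name, [])) > 10

    def reset_timing_info(self, limb):
        self.limb_inputs.pop(getattr(limb, "__name__", str(limb)), None)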
Example #2
import datetime


def main():
    sql_committer = SqlManager({
        "logger": None,
        # Hook telling SqlManager which objects on the package to persist.
        "get_object_fun": lambda pkg: pkg.threads
    })

    thread_obj = FourChanThread(
        input_dict={
            "is_pinned": False,
            "op_content": "OP Content",
            "post_datetime": datetime.datetime.now(),
            "image_content": "",
            "abbreviated": False,
            "body_cut_off": False,
            "post_num": "1020471246",
            "link": "URL"
        })

    pkg = Package()
    pkg.threads = [thread_obj]

    sql_committer.scrape_from_url("", pkg)


if __name__ == "__main__":
    main()
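
The get_object_fun hook is how SqlManager learns which objects on the package to commit. A minimal sketch of the consumer side of such a hook; the class body and the commit_object helper are hypothetical, only the hook name comes from the config above:

class SqlManager:
    """Hypothetical sketch of the hook's consumer side."""

    def __init__(self, config):
        self.logger = config["logger"]
        self.get_object_fun = config["get_object_fun"]

    def scrape_from_url(self, url, package):
        # Ask the configured hook for the objects to persist, then hand
        # each one to a (hypothetical) commit helper.
        for obj in self.get_object_fun(package):
            self.commit_object(obj)

    def commit_object(self, obj):
        raise NotImplementedError("placeholder for the real SQL insert")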
Example #3
            print("Visit time received for " + url + ": " + str(this_visit_time))
            visit_time = this_visit_time

        cursor.close()
        return visit_time


    def scrape_from_url(self, url, package):
        self.create_table_if_not_exist()
        self.write_url_visit_time(url)

        print("Linked Resources Before: " + str(package.linked_resources))

        # Iterate over a copy: removing from a list while iterating over it
        # silently skips the element after each removal.
        for next_resource in list(package.linked_resources):
            last_visited_time = self.fetch_last_visited_time(next_resource)
            print("Last visit time for " + next_resource + ": " + str(last_visited_time))
            if last_visited_time and datetime.datetime.now() - last_visited_time < IGNORE_TIME_THRESHOLD:
                package.linked_resources.remove(next_resource)

        print("Linked Resources After: " + str(package.linked_resources))


if __name__ == "__main__":
    url = "d1b5f81e158880a30e9210172e21709c"
    config = {"logger": None}
    package = Package()
    package.linked_resources = ["d1b5f81e158880a30e9210172e21709c", "7a0c13eac425663e5fd6a3c7470e6dbb"]

    ignore_recent_obj = IgnoreRecentUrls(config)
    ignore_recent_obj.scrape_from_url(url, package)
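
IGNORE_TIME_THRESHOLD is used above but not defined in this fragment; presumably it is a module-level timedelta, along these lines (the one-day value is an assumption):

import datetime

# Assumed value: skip URLs that were already visited within the last day.
IGNORE_TIME_THRESHOLD = datetime.timedelta(days=1)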
Example #4
        super(SleepLimb, self).__init__(config_dict)

        self.logger = config_dict["logger"]

        # "^" matches the start of any string, so ingest() runs for every URL.
        wildcard_re = re.compile("^")
        self.associate_regex_with_method(wildcard_re, self.ingest)

    def ingest(self, url, data_package):
        """
        Sends a text message to a pre-defined number based on attributes of data_package
        :param url: the URL of the page being processed
        :param data_package: the Package() object containing the data accrued from previous limbs
        :return: None
        """

        self.logger.info("Currently processing " + url +
                         " with the sleep limb. No action has been taken.")
        time.sleep(5)
        self.logger.info("\t Done.")


if __name__ == "__main__":
    config = {
        "logger": centipede_logger.create_logger("empty_limb", logging.DEBUG)
    }
    send_text = SleepLimb(config)

    pack = Package()
    send_text.scrape_from_url("", pack)
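
associate_regex_with_method implies a regex-to-handler dispatch table in the base limb class. A minimal sketch of that pattern; the LimbBase class here is hypothetical, only the method names come from the example:

import re


class LimbBase:
    """Hypothetical base class illustrating regex-based URL dispatch."""

    def __init__(self, config_dict):
        self._handlers = []

    def associate_regex_with_method(self, regex, method):
        # Register a handler to run for any URL the pattern matches.
        self._handlers.append((regex, method))

    def scrape_from_url(self, url, data_package):
        # Run every handler whose pattern matches the URL; the "^" wildcard
        # registered by SleepLimb matches everything, including "".
        for regex, method in self._handlers:
            if regex.search(url):
                method(url, data_package)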
Example #5
            comment_data = RedditComment(
                input_dict={
                    "comment_id": id,
                    "content": body,
                    "comment_datetime": time_string,
                    "comment_author": username,
                    "points": points,
                    "source": page_url,
                    "rank": i
                })

            data_package.reddit_info.append(comment_data)


if __name__ == "__main__":
    config_dict = {
        "logger": centipede_logger.create_logger("reddit_scraper",
                                                 logging.DEBUG),
        "ff_binary_location": "C:\\Program Files\\Mozilla Firefox",
        "SPOOF_USER_AGENT": True,
        "USE_PROXY_SERVER": False
    }
    scraper = RedditScraper(config_dict)

    pkg = Package()
    scraper.scrape_from_url(
        "https://www.reddit.com/r/SelfAwarewolves/comments/ga057f/a_trans_exclusionist_has_a_brush_with_self/",
        pkg)
    print(pkg.__dict__)
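
The ff_binary_location, SPOOF_USER_AGENT, and USE_PROXY_SERVER keys suggest the scraper drives Firefox through Selenium. A minimal sketch of how such flags might be honored when building the driver; the build_driver helper, the executable path suffix, and the user-agent string are assumptions, not part of the original:

from selenium import webdriver


def build_driver(config_dict):
    # Hypothetical helper: translate the config flags above into a
    # Selenium Firefox session.
    options = webdriver.FirefoxOptions()
    options.binary_location = config_dict["ff_binary_location"] + "\\firefox.exe"
    if config_dict.get("SPOOF_USER_AGENT"):
        # Override the reported user agent via a Firefox preference.
        options.set_preference("general.useragent.override",
                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    return webdriver.Firefox(options=options)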
Example #6
            image_contents = base64.b64encode(contents)

        post_num_obj = thread_object.find("span", class_="postNum")
        post_num = post_num_obj.find_all("a")[1].get_text()

        thread_permalink = thread_page + "thread/" + str(post_num)

        thread_attributes = {
            "is_pinned": pinned,
            "op_content": op_content,
            "post_datetime": op_datetime,
            "image_content": image_contents,
            "abbreviated": False,
            "body_cut_off": False,
            "post_num": post_num,
            "link": thread_permalink
        }
        thread_obj = FourChanThread(thread_attributes)

        # Create the list only on first use so threads collected earlier are
        # not clobbered.
        if not hasattr(data_package, "threads"):
            data_package.threads = []
        data_package.threads.append(thread_obj)
        print(thread_obj)


if __name__ == "__main__":
    scraper = FourChanScraper({"logger": centipede_logger})
    data_package = Package()
    # scraper.scrape_from_url("http://boards.4chan.org/a/", data_package)

    for thread in data_package.threads:
        print(thread)