Exemplo n.º 1
0
    def get_subreddits_links_to_build_task(self):
        """Return the URLs that still have to be processed.

        Asks ``Extract.get_urls_for_all_subreddits`` for the URLs of every
        subreddit returned by ``Base.get_data_list_subreddits`` (using
        ``self.st_dt`` / ``self.end_dt`` as the date range) and removes the
        entries reported by ``Base.check_resume_file`` for
        ``self.resume_file``. A progress summary is printed when the resume
        file reports at least one entry.

        Returns:
            list: The pending URLs. When resume entries exist the list is
            de-duplicated and filtered; otherwise the URL list is returned
            exactly as produced by the extractor.
        """
        base_ = Base()
        extract_ = Extract()
        list_subreddits_data = base_.get_data_list_subreddits()
        # NOTE(review): presumably the URLs finished by an earlier run --
        # confirm against Base.check_resume_file.
        downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
        urls = extract_.get_urls_for_all_subreddits(
            subreddits=list_subreddits_data,
            start_date=self.st_dt,
            end_date=self.end_dt)
        if len(downloaded_subs) > 0:
            already_done = set(downloaded_subs)
            # dict.fromkeys de-duplicates just like the previous set
            # difference did, but keeps the extractor's ordering (on
            # Python 3.7+) so a resumed run is repeatable.
            urls = [url for url in dict.fromkeys(urls)
                    if url not in already_done]
            print("Already Dowloaded {} sub-reddits yet to download {} sub-reddits".format(len(downloaded_subs), len(urls)))
            # Progress is measured against done + pending. That total is
            # never zero inside this branch, so the division cannot fail
            # even when nothing is left to download.
            total = len(downloaded_subs) + len(urls)
            completed_pct = round(100.0 * len(downloaded_subs) / total, 2)
            print("Completed {}%".format(completed_pct))

        return urls
Exemplo n.º 2
0
 def run_extraction(self):
     """Run the URL-based extraction for everything not yet downloaded.

     Builds the URL list for all subreddits in the ``self.st_dt`` /
     ``self.end_dt`` range, drops the entries reported by
     ``Base.check_resume_file`` for ``self.resume_file`` and hands the
     remainder to ``Extract.url_based_extraction`` with ``self.sav_path``
     as the base path. A progress summary is printed when the resume file
     reports at least one entry.
     """
     extract_ = Extract()
     base_ = Base()
     list_subreddits_data = base_.get_data_list_subreddits()
     # NOTE(review): presumably the URLs finished by an earlier run --
     # confirm against Base.check_resume_file.
     downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
     pending_urls = extract_.get_urls_for_all_subreddits(
         subreddits=list_subreddits_data,
         start_date=self.st_dt,
         end_date=self.end_dt)
     if len(downloaded_subs) > 0:
         already_done = set(downloaded_subs)
         # dict.fromkeys de-duplicates just like the previous set
         # difference did, but keeps the extractor's ordering (on
         # Python 3.7+) so a resumed run is repeatable.
         pending_urls = [url for url in dict.fromkeys(pending_urls)
                         if url not in already_done]
         print(
             "Already Dowloaded {} sub-reddits yet to download {} sub-reddits"
             .format(len(downloaded_subs), len(pending_urls)))
         # Progress is measured against done + pending. That total is
         # never zero inside this branch, so the division cannot fail
         # even when nothing is left to download.
         total = len(downloaded_subs) + len(pending_urls)
         completed_pct = round(100.0 * len(downloaded_subs) / total, 2)
         print("Completed {}%".format(completed_pct))
     extract_.url_based_extraction(links=pending_urls, base_path=self.sav_path)
Exemplo n.º 3
0
 def run_extraction(self):
     """Extract every subreddit that has not been downloaded yet.

     Takes the subreddit list from ``Base.get_data_list_subreddits``,
     removes the entries reported by ``Base.check_resume_file`` for
     ``self.resume_file`` and calls ``Extract.start_extraction`` once per
     remaining subreddit with the ``self.st_dt`` / ``self.end_dt`` range and
     ``self.sav_path`` as the base path. The ``(start_time, cost)`` pair
     returned by each call is printed and passed into the next call.
     """
     extract_ = Extract()
     base_ = Base()
     list_subreddits_data = base_.get_data_list_subreddits()
     # NOTE(review): presumably the subreddits finished by an earlier run --
     # confirm against Base.check_resume_file.
     downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
     if len(downloaded_subs) > 0:
         already_done = set(downloaded_subs)
         # dict.fromkeys de-duplicates just like the previous set
         # difference did, but keeps the source ordering (on Python 3.7+)
         # so a resumed run processes subreddits in a repeatable order.
         remaining_list = [
             subreddit for subreddit in dict.fromkeys(list_subreddits_data)
             if subreddit not in already_done]
         print(
             "Already Dowloaded {} sub-reddits yet to download {} sub-reddits"
             .format(len(downloaded_subs), len(remaining_list)))
         # Progress is measured against done + pending. That total is
         # never zero inside this branch, so the division cannot fail
         # even if the subreddit list came back empty.
         total = len(downloaded_subs) + len(remaining_list)
         completed_pct = round(100.0 * len(downloaded_subs) / total, 2)
         print("Completed {}%".format(completed_pct))
         list_subreddits_data = remaining_list
     start_time = time.time()
     # NOTE(review): running total handed to and returned by
     # start_extraction; its unit is not visible here -- confirm.
     cost = 0
     for subreddit in list_subreddits_data:
         start_time, cost = extract_.start_extraction(
             subreddit=subreddit,
             start_date=self.st_dt,
             end_date=self.end_dt,
             base_path=self.sav_path,
             start_time=start_time,
             total_cost=cost)
         print(cost)
         print(start_time)