# Assumed imports for these snippets; module-level constants (archives_path,
# archives_base_path, channels_path, download_path, batch_num) and the helper
# remove_rejected_video are assumed to be defined elsewhere in the repo, and
# the methods taking `self` belong to the pipeline's downloader class.
import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from pandas.errors import EmptyDataError


def retrieve_archive_from_local(self, source):
    # Return the entries already recorded in the local archive for this source.
    archive_path = archives_path.replace('<source>', source)
    if os.path.exists(archive_path):
        with open(archive_path, 'r') as f:
            lines = f.readlines()
        return [line.rstrip('\n') for line in lines]
    else:
        logging.info("No archive.txt found...")
        return []

def download_files(self, channel_name, file_name, batch_list):
    # Download every video id in the batch; youtube_download is expected to
    # return (remove_video_flag, video_id) and to record successes in the
    # channel's archive file.
    archive_path = archives_path.replace('<source>', channel_name)
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = []
        for video_id in batch_list:
            futures.append(
                executor.submit(self.youtube_dl_service.youtube_download,
                                video_id, archive_path, download_path))
        for future in as_completed(futures):
            remove_video_flag, video_id = future.result()
            if remove_video_flag:
                # Drop rejected videos from the channel's source file.
                remove_rejected_video(file_name, video_id)

def get_video_batch(source, source_file):
    # Build the next batch to download: ids listed in the channel file that do
    # not yet appear in the source's archive file, capped at batch_num.
    source = source.replace('.txt', '')
    channel_file_name = channels_path + source_file
    archive_file_name = archives_path.replace('<source>', source)
    try:
        # Channel file: one video id per line, no header.
        channel_videos = pd.read_csv(channel_file_name, header=None)
    except EmptyDataError:
        return []
    try:
        # Archive file: space-delimited lines (presumably youtube-dl's
        # "<extractor> <video_id>" download-archive format), so the id sits
        # in column 1.
        channel_archive = pd.read_csv(archive_file_name,
                                      delimiter=' ',
                                      header=None,
                                      encoding='utf-8')[[1]]
    except EmptyDataError:
        channel_archive = pd.DataFrame(columns=[1])

    # Left-join the channel ids against the archived ids and keep only the
    # rows with no match in the archive.
    video_batch = channel_videos[channel_videos.merge(
        channel_archive, left_on=0, right_on=1,
        how='left')[1].isnull()].head(batch_num)
    return video_batch[0].tolist()
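
# Illustrative example of the two inputs compared above (file names and ids
# are made up; the archive layout assumes youtube-dl's download-archive
# format "<extractor> <video_id>"):
#
#   channel file under channels_path     archive file under archives_path
#   ---------------------------------    ---------------------------------
#   a1b2c3d4e5f                          youtube a1b2c3d4e5f
#   f6g7h8i9j0k
#
# get_video_batch('some_channel', 'some_channel.txt') would then return
# ['f6g7h8i9j0k']: the only id with no match in the archive.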
 def retrieve_archive_from_bucket(self, source, language=""):
     archive_path = archives_path.replace('<source>', source)
     if not os.path.exists(archives_base_path):
         os.system('mkdir ' + archives_base_path)
     if not os.path.exists(archives_base_path + source + "/"):
         os.system('mkdir {0}/{1}'.format(archives_base_path, source))
     if self.check(self.get_archive_file_bucket_path(source, language)):
         self.download(self.get_archive_file_bucket_path(source, language),
                       archive_path)
         logging.info(
             str("Archive file has been downloaded from bucket {0} to local path..."
                 .format(self.bucket)))
         with open(archive_path, 'r') as f:
             num_downloaded = len(f.read().splitlines())
         logging.info(
             str("Count of Previously downloaded files are : {0}".format(
                 num_downloaded)))
     else:
         os.system('touch {0}'.format(archive_path))
         logging.info(
             "No Archive file has been found on bucket...Downloading all files..."
         )
 def upload_archive_to_bucket(self, source, language=""):
     archive_path = archives_path.replace('<source>', source)
     archive_bucket_path = self.get_archive_file_bucket_path(
         source, language)
     self.upload(archive_path, archive_bucket_path)

def populate_local_archive(self, source, url):
    # Append a newly downloaded item to the source's local archive file.
    with open(archives_path.replace('<source>', source), 'a+') as f:
        f.write(url + '\n')
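
# A minimal sketch of how these pieces appear to fit together for one source.
# The `svc` object and the '<source>.txt' naming are illustrative assumptions,
# not taken from the source; `svc` stands for the class owning the methods above.
#
#   svc.retrieve_archive_from_bucket(source, language)  # pull archive from bucket
#   batch = get_video_batch(source, source + '.txt')    # ids not yet archived
#   svc.download_files(source, source + '.txt', batch)  # download and archive them
#   svc.upload_archive_to_bucket(source, language)      # push updated archive back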