class YoutubeChannelCollector:

    def __init__(self, youtube):
        self.youtube = youtube
        self.storage_util = StorageUtil()

        self.TYPE = "channel"
        self.MAX_PAGE_RESULT = 50

        config = load_config_file("youtube_api_config.json")
        num_pages, num_results = self.__calculate_pages(config["max_results"])
        self.max_results = num_results
        self.pages = num_pages
        self.rel_language = config["language_code"]
        self.__set_keywords(config)
        self.pages_exhausted = False

    def __set_keywords(self, config):
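        # Worked example (illustrative config values): with language "tamil",
        # keywords ["news", "speech"] and words_to_ignore ["song", "music"],
        # self.keywords becomes ['in', 'tamil', 'news|speech', '-song -music'],
        # which the channel search joins into the query
        # "in tamil news|speech -song -music".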
        words_to_include = "|".join(config["keywords"])
        words_to_ignore = " ".join(["-" + word_to_ignore for word_to_ignore in config["words_to_ignore"]])
        self.keywords = ['in', config["language"], words_to_include, words_to_ignore]
        self.query_keywords = config['keywords']
        self.keyword_prefix = 'in {}'.format(config['language'])

    def __calculate_pages(self, max_results):
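        # Worked example: max_results=120 with MAX_PAGE_RESULT=50 gives
        # num_pages=3 (two full pages plus the remainder) and num_results=50
        # per request; max_results=30 gives a single page of 30 results.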
        if max_results <= self.MAX_PAGE_RESULT:
            num_results = max_results
            num_pages = 1
        else:
            num_pages = int(max_results // self.MAX_PAGE_RESULT)
            num_results = self.MAX_PAGE_RESULT
            if max_results % self.MAX_PAGE_RESULT != 0:
                num_pages += 1
        return num_pages, num_results

    def __youtube_api_call_for_channel_search(self, token):
        return self.youtube.search().list(part="id,snippet", type=self.TYPE,
                                          q=' '.join(self.keywords),
                                          maxResults=self.max_results,
                                          relevanceLanguage=self.rel_language,
                                          pageToken=token).execute()

    def __get_page_channels(self):
        token = self.storage_util.get_token_from_local()
        results = self.__youtube_api_call_for_channel_search(token)
        page_channels = {}
        for item in results['items']:
            title = item['snippet']['channelTitle']
            # Sanitize the channel title for use in file and blob names.
            title = title.replace("'", "") \
                .replace(" ", "_") \
                .replace(',', '_') \
                .replace('/', '_') \
                .replace('\\', '_') \
                .replace('.', '_') \
                .replace('$', '_')
            page_channels['https://www.youtube.com/channel/' +
                          item['snippet']['channelId']] = title
        if 'nextPageToken' in results:
            next_token = results['nextPageToken']
            self.storage_util.set_token_in_local(next_token)
        else:
            self.pages_exhausted = True
        return page_channels

    def get_urls(self):
        complete_channels = {}
        for _ in range(self.pages):
            if self.pages_exhausted:
                break
            page_channels = self.__get_page_channels()
            complete_channels.update(page_channels)
        return complete_channels

    def youtube_api_call_for_cc_video_search(self, keyword, token):
        return self.youtube.search().list(part="id,snippet", type='video', q=keyword, maxResults=self.max_results,
                                          relevanceLanguage=self.rel_language,
                                          videoLicense='creativeCommon',
                                          pageToken=token).execute()

    def __get_page_cc_videos(self, keyword, token):
        results = self.youtube_api_call_for_cc_video_search(keyword + " " + self.keyword_prefix, token)
        page_channels = {}
        next_token = None
        for item in results['items']:
            title = item['snippet']['channelTitle']
            # Sanitize the channel title for use in file and blob names.
            title = title.replace("'", "") \
                .replace(" ", "_") \
                .replace(',', '_') \
                .replace('/', '_') \
                .replace('\\', '_') \
                .replace('.', '_') \
                .replace('$', '_')
            channel_id = item['snippet']['channelId']
            # Hard-coded exclusions: skip these two specific channels.
            if channel_id in ('UCmyKnNRH0wH-r8I-ceP-dsg', 'UCcTKQnC3lRA4aira95_a1pw'):
                continue
            page_channels['https://www.youtube.com/channel/' +
                          channel_id] = title
        if 'nextPageToken' in results:
            next_token = results['nextPageToken']
            # next_token is returned to the caller, which persists it per keyword
        else:
            self.pages_exhausted = True
        return page_channels, next_token

    def get_cc_video_channels(self):
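        # The local token file is expected to hold a Python dict literal
        # (e.g. "{'news': 'CAUQAA'}") mapping each query keyword to its last
        # YouTube page token; it is parsed with ast.literal_eval below and
        # rewritten with the updated tokens at the end of the run.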
        complete_channels = {}
        token_from_local = self.storage_util.get_token_from_local()
        if "{" not in token_from_local:
            token_from_local = "{}"
        gl_token = ast.literal_eval(token_from_local)
        tmp_gl_token = {}
        for keyword in self.query_keywords:
            if keyword in gl_token:
                token = gl_token[keyword]
            else:
                token = ''
            for _ in range(self.pages):
                if self.pages_exhausted:
                    self.pages_exhausted = False
                    break
                page_channels, token = self.__get_page_cc_videos(keyword, token)
                complete_channels.update(page_channels)
            tmp_gl_token[keyword] = token
        self.storage_util.set_token_in_local(str(tmp_gl_token))
        return complete_channels
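
# Usage sketch (illustrative; assumes an authenticated googleapiclient
# "youtube" resource and a youtube_api_config.json on the load path):
#
#   from googleapiclient.discovery import build
#   youtube = build("youtube", "v3", developerKey="<api-key>")
#   collector = YoutubeChannelCollector(youtube)
#   channels = collector.get_urls()  # {channel_url: sanitized_title, ...}
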
class YoutubeUtil:
    def __init__(self):
        self.t_duration = 0
        self.storage_util = StorageUtil()
        self.youtube_dl_service = YoutubeDL()
        self.youtube_api_service = YoutubeApiUtils()

    def create_channel_file(self, source_channel_dict):
        if not (os.path.exists(channels_path)):
            os.mkdir(channels_path)

        for channel_url in source_channel_dict.keys():
            source_channel_dict[channel_url] = str(
                source_channel_dict[channel_url]).replace(' ', '_')
            # if "playlist?list=" in channel_url:
            #     pass
            # else:
            channel_id = channel_url.split('/')[-1]
            id_name_join = channel_id + '__' + source_channel_dict[channel_url]
            source_channel_file = channels_path + id_name_join + '.txt'

            is_downloaded = self.storage_util.get_videos_of_channel(
                id_name_join)

            if not is_downloaded:
                if youtube_service_to_use == YoutubeService.YOUTUBE_DL:
                    if only_creative_commons:
                        tmps_videos_list = self.youtube_dl_service.get_videos(
                            channel_url)
                        videos_list = []
                        for video in tmps_videos_list:
                            if 'Creative Commons' == self.youtube_api_service.get_license_info(
                                    video):
                                videos_list.append(video)
                    else:
                        videos_list = self.youtube_dl_service.get_videos(
                            channel_url)
                else:
                    videos_list = self.youtube_api_service.get_videos(
                        channel_id)
                tmp_videos_list = []
                with open(source_channel_file, 'w') as channel_file:
                    for video_id in videos_list:
                        if video_id not in tmp_videos_list:
                            channel_file.write(video_id + "\n")
                            tmp_videos_list.append(video_id)
                self.storage_util.upload(
                    source_channel_file,
                    self.storage_util.get_channel_file_upload_path(
                        id_name_join))
            else:
                if only_creative_commons:
                    tmp_videos_list = []
                    with open(source_channel_file, 'r') as channel_file:
                        for video_id in channel_file.read().splitlines():
                            if 'Creative Commons' == self.youtube_api_service.get_license_info(
                                    video_id):
                                tmp_videos_list.append(video_id)
                    with open(source_channel_file, 'w') as channel_file:
                        channel_file.writelines(
                            [video_id + "\n" for video_id in tmp_videos_list])

    def download_files(self, channel_name, file_name, batch_list):
        archive_path = archives_path.replace('<source>', channel_name)
        with ThreadPoolExecutor(max_workers=1) as executor:
            futures = []
            for video_id in batch_list:
                futures.append(
                    executor.submit(self.youtube_dl_service.youtube_download,
                                    video_id, archive_path, download_path))
            for future in as_completed(futures):
                remove_video_flag, video_id = future.result()
                if remove_video_flag:
                    remove_rejected_video(file_name, video_id)

    def get_license_info(self, video_id):
        # return "Creative Commons"
        return self.youtube_api_service.get_license_info(video_id)

    def get_channels(self):
        if only_creative_commons:
            return self.youtube_api_service.get_cc_video_channels()
        return self.youtube_api_service.get_channels()

    def get_video_info(self, file, channel_name, filemode_data, channel_id):
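        # The media filename is assumed to follow the pattern
        # "<duration-in-seconds>file-id<video-id>.<ext>": the prefix before
        # "file-id" is the clip length in seconds (converted to minutes
        # below), and the suffix after it is the YouTube video id.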
        video_id = file.replace(download_path, "").split('file-id')[-1][:-4]
        video_url_prefix = 'https://www.youtube.com/watch?v='
        channel_url_prefix = 'https://www.youtube.com/channel/'
        source_url = video_url_prefix + video_id
        video_duration = int(
            file.replace(download_path, "").split('file-id')[0]) / 60
        if mode == 'file':
            licence = get_license(filemode_data, video_id)
            if licence == "":
                licence = self.get_license_info(video_id)
        else:
            if only_creative_commons:
                licence = "Creative Commons"
            else:
                licence = self.get_license_info(video_id)

        video_info = {
            'duration': video_duration,
            'source': channel_name,
            'raw_file_name': file.replace(download_path, ""),
            'name': get_speaker(filemode_data, video_id) if mode == 'file' else None,
            'gender': get_gender(filemode_data, video_id) if mode == 'file' else None,
            'source_url': source_url,
            'license': licence
        }
        self.t_duration += video_duration
        logging.info("Total duration downloaded so far: {0} hours".format(self.t_duration // 60))
        if mode == "channel":
            video_info['source_website'] = channel_url_prefix + channel_id
        return video_info

    def validate_mode_and_get_result(self):
        scraped_data = None
        if mode == "file":
            videos_file_path = self.storage_util.get_videos_file_path_in_bucket(
                source_name)
            if self.storage_util.check(videos_file_path):
                self.storage_util.download(videos_file_path,
                                           source_name + ".csv")
                logging.info("Source scraped file has been downloaded from bucket to local path...")
                scraped_data = create_channel_file_for_file_mode(
                    source_name + ".csv", file_url_name_column)
            else:
                logging.error("{0} file doesn't exist at the given location: {1}".format(
                    source_name + ".csv", videos_file_path))
        elif mode == "channel":
            self.get_channels_from_source()
        else:
            logging.error("Invalid mode")

        for channel_file in glob.glob(channels_path + '*.txt'):
            yield mode, channel_file.replace(channels_path, ''), scraped_data

    def get_channels_from_source(self):
        if is_channel_from_config():
            source_channel_dict = channel_url_dict
        else:
            source_channel_dict = self.get_channels()
        self.create_channel_file(source_channel_dict)
class AudioPipeline(FilesPipeline):

    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, download_func, settings)
        os.makedirs(download_path, exist_ok=True)
        self.archive_list = {}
        self.metadata_creator = MediaMetadata()
        self.storage_util = StorageUtil()

    def file_path(self, request, response=None, info=None, **kwargs):
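        # Example: a request for "https://host/audio%20clip,1.mp3" is stored
        # locally as "audio_20clip_1.mp3".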
        file_name: str = request.url.split("/")[-1]
        file_name = file_name.replace("%", "_").replace(",", "_")
        return file_name

    def process_item(self, item, spider):
        if type(item) is LicenseItem:
            if item["key_name"] == "html_page":
                file_name = "license_{0}.txt".format(item["source"])
                self.storage_util.write_license_to_local(file_name, item['content'])
                self.storage_util.upload_license(file_name, item["source"], item["language"])
                raise DropItem()
            elif item["key_name"] == "creativecommons":
                file_name = "license_{0}.txt".format(item["source"])
                content = "creative commons => " + item["file_urls"][0]
                self.storage_util.write_license_to_local(file_name, content)
                self.storage_util.upload_license(file_name, item["source"], item["language"])
                raise DropItem()
            elif item["key_name"] == "document":
                return super().process_item(item, spider)
            else:
                exception_message = "Invalid key_name used for license item {}".format(item["key_name"])
                logging.error(exception_message)
                raise DropItem(exception_message)
        else:
            return super().process_item(item, spider)

    def item_completed(self, results, item, info):
        duration_in_seconds = 0
        with suppress(KeyError):
            ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
        if self.is_download_success(item):
            file_stats = item['files'][0]
            file = file_stats['path']
            url = file_stats['url']
            media_file_path = download_path + file
            if os.path.isfile(media_file_path):
                logging.info(str("***File {0} downloaded ***".format(file)))
                if type(item) is LicenseItem:
                    self.upload_license_to_bucket(item, media_file_path)
                else:
                    duration_in_seconds = self.upload_file_to_storage(file, item, media_file_path, url)
            else:
                logging.info(str("***File {0} not downloaded ***".format(item["title"])))
        if type(item) is not LicenseItem:
            item["duration"] = duration_in_seconds
        return item

    def upload_file_to_storage(self, file, item, media_file_path, url):
        duration_in_seconds = 0
        self.storage_util.populate_local_archive(item["source"], url)
        try:
            duration_in_seconds = self.extract_metadata(media_file_path, url, item)
            self.storage_util.upload_media_and_metadata_to_bucket(item["source"], media_file_path,
                                                                  item["language"])
            self.storage_util.upload_archive_to_bucket(item["source"], item["language"])
            logging.info(str("***File {0} uploaded ***".format(file)))
        except Exception as exception:
            logging.error(exception)
            os.remove(media_file_path)
        return duration_in_seconds

    def upload_license_to_bucket(self, item, media_file_path):
        self.storage_util.upload_license(media_file_path, item["source"], item["language"])

    def get_media_requests(self, item, info):
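        # Build requests only for URLs not already recorded in this source's
        # archive; the archive is synced down from the bucket once per source,
        # when its local directory is missing.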
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if item["source"] not in self.archive_list:
            self.archive_list[item["source"]] = []
        if not os.path.isdir(archives_base_path + item["source"]):
            self.storage_util.retrieve_archive_from_bucket(item["source"], item["language"])
            self.archive_list[item["source"]] = self.storage_util.retrieve_archive_from_local(item["source"])
        return [Request(u) for u in urls if u not in self.archive_list[item["source"]]]

    def is_download_success(self, item):
        return len(item['files']) > 0

    def extract_metadata(self, file, url, item):
        file_format = get_file_format(file)
        meta_file_name = file.replace(file_format, "csv")
        media_info, duration_in_seconds = get_media_info(file, item['source'], item['language'], item['source_url'],
                                                         item['license_urls'], url)
        metadata = self.metadata_creator.create_metadata(media_info)
        metadata_df = pd.DataFrame([metadata])
        metadata_df.to_csv(meta_file_name, index=False)
        return duration_in_seconds
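
# Enabling this pipeline in Scrapy settings (illustrative module path and
# priority; the project's actual settings may differ):
#
#   ITEM_PIPELINES = {
#       "data_acquisition_framework.pipelines.AudioPipeline": 1,
#   }
#   FILES_STORE = "downloads/"
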
class TestStorageUtil(TestCase):

    @patch('data_acquisition_framework.services.storage_util.load_config_file')
    def setUp(self, mock_load_storage_config):
        data = {
            "bucket": "ekstepspeechrecognition-dev",
            "channel_blob_path": "scrapydump/refactor_test",
            "archive_blob_path": "archive",
            "scraped_data_blob_path": "scraped",
            "channels_file_blob_path": "channels"
        }
        mock_load_storage_config.return_value = data
        self.storage_util = StorageUtil()
        self.storage_config = data

    @patch('data_acquisition_framework.services.storage_util.set_gcs_credentials')
    def test_set_gcs_creds_called_with_proper_input(self, mock_set_gcs_credentials):
        input_value = '{"Credentials": {"name":"hello"}}'
        cred = json.loads('{"name":"hello"}')

        self.storage_util.set_gcs_creds(input_value)

        mock_set_gcs_credentials.assert_called_once_with(cred)

    @patch('data_acquisition_framework.services.storage_util.set_gcs_credentials')
    def test_set_gcs_creds_throw_input_type_error(self, mock_gcs_creds):
        input_value = {"Credentials": {"name": "hello"}}

        with self.assertRaises(TypeError):
            self.storage_util.set_gcs_creds(input_value)

    @patch('data_acquisition_framework.services.storage_util.set_gcs_credentials')
    def test_set_gcs_creds_throw_input_not_found_error(self, mock_gcs_creds):
        input_value = '{"name": "hello"}'

        with self.assertRaises(KeyError):
            self.storage_util.set_gcs_creds(input_value)

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_upload_success(self, mock_upload_blob):
        file_to_upload = "test/a.mp4"
        location_to_upload = "test/a/a.mp4"

        self.storage_util.upload(file_to_upload, location_to_upload)

        mock_upload_blob.assert_called_once_with(self.storage_config["bucket"], file_to_upload, location_to_upload)

    @patch('data_acquisition_framework.services.storage_util.download_blob')
    def test_download(self, mock_download_blob):
        file_to_download = "test/a/a.mp4"
        download_location = "test/a.mp4"

        self.storage_util.download(file_to_download, download_location)

        mock_download_blob.assert_called_once_with(self.storage_config["bucket"], file_to_download, download_location)

    @patch('data_acquisition_framework.services.storage_util.check_blob')
    def test_check(self, mock_check_blob):
        mock_check_blob.return_value = True
        file_to_check = "test/a.mp4"
        result = self.storage_util.check(file_to_check)

        mock_check_blob.assert_called_once_with(self.storage_config['bucket'], file_to_check)
        self.assertTrue(result)

    def test_get_archive_file_bucket_path_with_language(self):
        source = "test"
        language = "tamil"
        expected = self.storage_config["channel_blob_path"] + "/" + language + '/' + self.storage_config[
            "archive_blob_path"] + '/' + source + '/' + self.storage_util.archive_file_name

        tmp_channel_blob_path = self.storage_util.channel_blob_path
        self.storage_util.channel_blob_path = self.storage_config["channel_blob_path"] + "/<language>"

        path = self.storage_util.get_archive_file_bucket_path(source, language)

        self.storage_util.channel_blob_path = tmp_channel_blob_path
        self.assertEqual(expected, path)

    def test_get_archive_file_bucket_path_without_language(self):
        source = "test"
        language = "tamil"
        expected = self.storage_config["channel_blob_path"] + '/' + self.storage_config[
            "archive_blob_path"] + '/' + source + '/' + self.storage_util.archive_file_name

        path = self.storage_util.get_archive_file_bucket_path(source, language)

        self.assertEqual(expected, path)

    def test_get_channel_file_upload_bucket_path_with_language(self):
        source = "test"
        language = "tamil"
        expected = self.storage_config[
                       'channel_blob_path'] + "/" + language + '/' + self.storage_config[
                       'channels_file_blob_path'] + '/' + source + '/videos_list.txt'

        tmp_channel_blob_path = self.storage_util.channel_blob_path
        self.storage_util.channel_blob_path = self.storage_config["channel_blob_path"] + "/<language>"

        path = self.storage_util.get_channel_file_upload_path(source, language)

        self.storage_util.channel_blob_path = tmp_channel_blob_path
        self.assertEqual(expected, path)

    def test_get_channel_file_upload_bucket_path_without_language(self):
        source = "test"
        language = "tamil"
        expected = self.storage_config[
                       'channel_blob_path'] + '/' + self.storage_config[
                       'channels_file_blob_path'] + '/' + source + '/videos_list.txt'

        path = self.storage_util.get_channel_file_upload_path(source, language)

        self.assertEqual(expected, path)

    @patch('data_acquisition_framework.services.storage_util.check_blob')
    @patch('data_acquisition_framework.services.storage_util.download_blob')
    def test_retrieve_archive_from_bucket_if_exists(self, mock_download_blob, mock_check_blob):
        mock_check_blob.return_value = True
        if os.path.exists(archives_base_path):
            os.system('rm -rf ' + archives_base_path)
        source = "test"
        language = "tamil"
        archive_bucket_path = self.storage_util.get_archive_file_bucket_path(source, language)

        def side_effect(bucket, source_file, destination):
            os.system("touch {0}".format(destination))

        mock_download_blob.side_effect = side_effect

        self.storage_util.retrieve_archive_from_bucket(source, language)

        self.assertTrue(os.path.exists(archives_base_path))
        self.assertTrue(os.path.exists(archives_base_path + source))
        self.assertTrue(os.path.exists(archives_path.replace("<source>", source)))
        mock_check_blob.assert_called_once_with(self.storage_config['bucket'],
                                                archive_bucket_path)
        mock_download_blob.assert_called_once_with(self.storage_config['bucket'], archive_bucket_path,
                                                   archives_path.replace("<source>", source))
        if os.path.exists(archives_base_path):
            os.system('rm -rf ' + archives_base_path)

    @patch('data_acquisition_framework.services.storage_util.check_blob')
    @patch('data_acquisition_framework.services.storage_util.download_blob')
    def test_retrieve_archive_from_bucket_if_not_exists(self, mock_download_blob, mock_check_blob):
        mock_check_blob.return_value = False
        if os.path.exists(archives_base_path):
            os.system('rm -rf ' + archives_base_path)
        source = "test"
        language = "tamil"
        archive_bucket_path = self.storage_util.get_archive_file_bucket_path(source, language)

        self.storage_util.retrieve_archive_from_bucket(source, language)

        self.assertTrue(os.path.exists(archives_base_path))
        self.assertTrue(os.path.exists(archives_base_path + source))
        self.assertTrue(os.path.exists(archives_path.replace("<source>", source)))
        mock_check_blob.assert_called_once_with(self.storage_config['bucket'],
                                                archive_bucket_path)
        mock_download_blob.assert_not_called()

        os.system('rm -rf ' + archives_base_path)

    def test_populate_local_archive(self):
        url = "http://gc/a.mp4"
        source = "test"

        if not os.path.exists(archives_base_path):
            os.system('mkdir ' + archives_base_path)
        if not os.path.exists(archives_base_path + source + "/"):
            os.system('mkdir {0}/{1}'.format(archives_base_path, source))

        self.storage_util.populate_local_archive(source, url)

        result = self.storage_util.retrieve_archive_from_local(source)

        self.assertEqual([url], result)

        os.system('rm -rf ' + archives_base_path)

    def test_retrieve_archive_from_local_if_exists(self):
        source = "test"
        url = "http://gc/a.mp4"
        archive_path = archives_path.replace('<source>', source)

        if not os.path.exists(archives_base_path):
            os.system('mkdir ' + archives_base_path)
        if not os.path.exists(archives_base_path + source + "/"):
            os.system('mkdir {0}/{1}'.format(archives_base_path, source))
        if not os.path.exists(archive_path):
            os.system('echo ' + url + '>' + archive_path)

        self.assertEqual([url], self.storage_util.retrieve_archive_from_local(source))

        os.system('rm -rf ' + archives_base_path)

    def test_retrieve_archive_from_local_if_not_exists(self):
        source = "test"
        archive_path = archives_path.replace('<source>', source)

        if os.path.exists(archive_path):
            os.system('rm -rf ' + archive_path)

        self.assertEqual([], self.storage_util.retrieve_archive_from_local(source))

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_upload_archive_to_bucket(self, mock_upload):
        source = "test"

        test_archive_bucket_path = self.storage_util.get_archive_file_bucket_path(source)
        self.storage_util.upload_archive_to_bucket(source)

        mock_upload.assert_called_with(self.storage_config['bucket'], archives_path.replace("<source>", source),
                                       test_archive_bucket_path)

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_upload_media_and_metadata_to_bucket(self, mock_upload_blob):
        metadata_file_path = download_path + "a.csv"
        media_file_path = download_path + "a.mp4"
        os.system("mkdir " + download_path)
        os.system("touch " + media_file_path)
        os.system("touch " + metadata_file_path)
        source = "test"
        media_bucket_path = self.storage_config['channel_blob_path'] + '/' + source + '/' + media_file_path.replace(
            download_path,
            "")
        meta_bucket_path = self.storage_config['channel_blob_path'] + '/' + source + '/' + metadata_file_path.replace(
            download_path,
            "")

        self.storage_util.upload_media_and_metadata_to_bucket(source, media_file_path)
        media_call = call(self.storage_config['bucket'], media_file_path, media_bucket_path)
        meta_call = call(self.storage_config['bucket'], metadata_file_path, meta_bucket_path)

        mock_upload_blob.assert_has_calls([media_call, meta_call])
        self.assertFalse(os.path.exists(media_file_path))
        self.assertFalse(os.path.exists(metadata_file_path))

        os.system("rm -rf " + download_path)

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_upload_license(self, mock_upload_blob):
        license_path = download_path + "a.txt"
        os.system("mkdir " + download_path)
        os.system("touch " + license_path)
        source = "test"
        license_bucket_path = self.storage_config[
                                  'channel_blob_path'] + '/' + source + '/' + 'license/' + license_path.replace(
            download_path,
            "")

        self.storage_util.upload_license(license_path, source)

        mock_upload_blob.assert_called_once_with(self.storage_config['bucket'], license_path, license_bucket_path)
        self.assertFalse(os.path.exists(license_path))

        os.system("rm -rf " + download_path)

    def test_get_token_path(self):
        expected = self.storage_config['channel_blob_path'] + '/' + self.storage_util.token_file_name

        result = self.storage_util.get_token_path()

        self.assertEqual(expected, result)

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_upload_token_to_bucket_if_exists(self, mock_upload_blob):
        token_path = "token.txt"
        os.system("touch " + token_path)

        self.storage_util.upload_token_to_bucket()

        token_bucket_path = self.storage_util.get_token_path()
        mock_upload_blob.assert_called_once_with(self.storage_config['bucket'], token_path, token_bucket_path)

        os.system("rm " + token_path)

    @patch('data_acquisition_framework.services.storage_util.upload_blob')
    def test_no_upload_token_to_bucket_if_not_exists(self, mock_upload_blob):
        token_path = "token.txt"
        if os.path.exists(token_path):
            os.system("rm " + token_path)

        self.storage_util.upload_token_to_bucket()

        mock_upload_blob.assert_not_called()

    @patch('data_acquisition_framework.services.storage_util.check_blob')
    @patch('data_acquisition_framework.services.storage_util.download_blob')
    def test_get_token_from_bucket_if_exists(self, mock_download_blob, mock_check_blob):
        mock_check_blob.return_value = True
        token_bucket_path = self.storage_util.get_token_path()

        def side_effect(bucket, file_to_download, download_location):
            os.system("touch {0}".format(download_location))

        mock_download_blob.side_effect = side_effect

        self.storage_util.get_token_from_bucket()

        mock_download_blob.assert_called_once_with(self.storage_config['bucket'], token_bucket_path, 'token.txt')
        self.assertTrue(os.path.exists("token.txt"))

        os.system("rm token.txt")

    @patch('data_acquisition_framework.services.storage_util.check_blob')
    @patch('data_acquisition_framework.services.storage_util.download_blob')
    def test_get_token_from_bucket_if_not_exists(self, mock_download_blob, mock_check_blob):
        mock_check_blob.return_value = False

        self.storage_util.get_token_from_bucket()

        mock_download_blob.assert_not_called()
        self.assertTrue(os.path.exists("token.txt"))

        os.system("rm token.txt")

    def test_get_token_from_local_if_exists(self):
        os.system("echo 'hello' > token.txt")
        expected = "hello"

        result = self.storage_util.get_token_from_local()

        self.assertEqual(expected, result)
        os.system("rm token.txt")

    def test_set_token_in_local(self):
        # Assumes set_token_in_local writes token.txt locally, which
        # get_token_from_local then reads back.
        expected = "hello"

        self.storage_util.set_token_in_local(expected)
        result = self.storage_util.get_token_from_local()

        self.assertEqual(expected, result)
        os.system("rm token.txt")

    def test_get_videos_file_path_in_bucket(self):
        source = "test"
        expected = self.storage_config["channel_blob_path"] + '/' + self.storage_config[
            "scraped_data_blob_path"] + '/' + source + '.csv'

        result = self.storage_util.get_videos_file_path_in_bucket(source)

        self.assertEqual(expected, result)

    def test_clear_required_directories_with_remove_downloads(self):
        os.system("mkdir " + download_path)
        os.system("mkdir " + channels_path)
        os.system("mkdir " + archives_base_path)

        self.storage_util.clear_required_directories()

        self.assertFalse(os.path.exists(download_path))
        self.assertFalse(os.path.exists(channels_path))
        self.assertFalse(os.path.exists(archives_base_path))

        os.system("rm -rf " + download_path)
        os.system("rm -rf " + channels_path)
        os.system("rm -rf " + archives_base_path)

    def test_clear_required_directories_with_create_downloads(self):
        os.system("rm -rf " + download_path)
        os.system("mkdir " + channels_path)
        os.system("mkdir " + archives_base_path)

        self.storage_util.clear_required_directories()

        self.assertTrue(os.path.exists(download_path))
        self.assertFalse(os.path.exists(channels_path))
        self.assertFalse(os.path.exists(archives_base_path))

        os.system("rm -rf " + download_path)
        os.system("rm -rf " + channels_path)
        os.system("rm -rf " + archives_base_path)

    def test_write_license_to_local(self):
        os.system("mkdir " + download_path)
        file_name = "a.txt"
        file_content = "hello"
        self.storage_util.write_license_to_local(file_name, file_content)

        file_path = download_path + file_name
        self.assertTrue(os.path.exists(file_path))
        with open(file_path) as f:
            self.assertEqual(file_content, f.read().rstrip())
        os.system("rm -rf " + download_path)

    def test_get_channel_videos_count(self):
        file_name = "test.txt"
        expected = 2
        channel_file_path = channels_path + file_name
        if not os.path.exists(channels_path):
            os.system("mkdir " + channels_path)
        with open(channel_file_path, 'w') as f:
            f.write("ab33cd" + "\n")
            f.write("ccdded")

        result = self.storage_util.get_channel_videos_count(file_name)

        self.assertEqual(expected, result)

        os.system('rm -rf ' + channels_path)

    def test_get_media_paths(self):
        file1 = download_path + "file1.mp4"
        file2 = download_path + "file2.mp4"
        if not os.path.exists(download_path):
            os.system("mkdir " + download_path)
        os.system("touch " + file1)
        os.system("touch " + file2)
        expected = [file1, file2]

        media_paths = self.storage_util.get_media_paths()

        self.assertEqual(expected, media_paths)

        os.system("rm -rf " + download_path)

    def test_get_videos_of_channel_when_present_in_bucket(self):
        with patch.object(self.storage_util, 'get_channel_file_upload_path') as mock_get_channel_file_upload_path:
            with patch.object(self.storage_util, 'check') as mock_check:
                with patch.object(self.storage_util, 'download') as mock_download:
                    mock_check.return_value = True
                    base_path = "test/test"
                    mock_get_channel_file_upload_path.return_value = base_path

                    id_name = "324234__hello"
                    flag = self.storage_util.get_videos_of_channel(id_name)

                    self.assertTrue(flag)
                    mock_get_channel_file_upload_path.assert_called_once_with(id_name)
                    mock_check.assert_called_once_with(base_path)
                    mock_download.assert_called_once_with(base_path, channels_path + id_name + ".txt")

    def test_get_videos_of_channel_when_not_present_in_bucket(self):
        with patch.object(self.storage_util, 'get_channel_file_upload_path') as mock_get_channel_file_upload_path:
            with patch.object(self.storage_util, 'check') as mock_check:
                with patch.object(self.storage_util, 'download') as mock_download:
                    mock_check.return_value = False
                    base_path = "test/test"
                    mock_get_channel_file_upload_path.return_value = base_path

                    id_name = "324234__hello"
                    flag = self.storage_util.get_videos_of_channel(id_name)

                    self.assertFalse(flag)
                    mock_get_channel_file_upload_path.assert_called_once_with(id_name)
                    mock_check.assert_called_once_with(base_path)
                    mock_download.assert_not_called()
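
# These tests can be run with unittest discovery, e.g. (module path is
# illustrative): python -m unittest data_acquisition_framework.tests.test_storage_util
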
class YoutubeApiPipeline(DataAcquisitionPipeline):
    def __init__(self):
        self.storage_util = StorageUtil()
        self.youtube_util = YoutubeUtil()
        self.metadata_creator = MediaMetadata()
        self.batch_count = 0

    def create_download_batch(self, item):
        return get_video_batch(item['channel_name'], item['filename'])

    def download_files(self, item, batch_list):
        self.youtube_util.download_files(item['channel_name'],
                                         item['filename'], batch_list)

    def extract_metadata(self, item, media_file_name, url=None):
        meta_file_name = get_meta_filename(media_file_name)
        video_info = self.youtube_util.get_video_info(media_file_name,
                                                      item['channel_name'],
                                                      item['filemode_data'],
                                                      item['channel_id'])
        metadata = self.metadata_creator.create_metadata(video_info)
        metadata_df = pd.DataFrame([metadata])
        metadata_df.to_csv(meta_file_name, index=False)
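        # Note: the CSV written above is uploaded alongside its media file by
        # upload_media_and_metadata_to_bucket, which also removes the local
        # copies once the upload succeeds.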

    def process_item(self, item, spider):
        self.batch_count = 0
        self.storage_util.retrieve_archive_from_bucket(item["channel_name"])
        channel_videos_count = self.storage_util.get_channel_videos_count(
            item['filename'])
        logging.info("Total valid videos count for the channel is {0}".format(
            channel_videos_count))
        self.batch_download(item)
        return item

    def video_batch_exists(self, batch_list):
        last_video_batch_count = len(batch_list)
        logging.info("Attempting to download videos with batch size of {0}".format(
            last_video_batch_count))
        return last_video_batch_count > 0

    def batch_download(self, item):
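        # Keep pulling fresh batches until none remain; the upload runs in the
        # finally block so files from a partially failed batch are still
        # pushed to storage.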
        batch_list = self.create_download_batch(item)
        while self.video_batch_exists(batch_list):
            try:
                self.download_files(item, batch_list)
            except Exception as e:
                logging.error(e)
            finally:
                self.upload_files_to_storage(item)
            batch_list = self.create_download_batch(item)
        logging.info("Last batch has no more videos to download, so finishing downloads...")
        logging.info("Total uploaded files for this run: {0}".format(self.batch_count))

    def upload_files_to_storage(self, item):
        channel_name = item['channel_name']
        media_paths = self.storage_util.get_media_paths()
        media_files_count = len(media_paths)
        if media_files_count > 0:
            self.batch_count += media_files_count
            logging.info("Uploading {0} files to gcs bucket...".format(media_files_count))
            for file in media_paths:
                self.extract_metadata(item, file)
                self.storage_util.upload_media_and_metadata_to_bucket(
                    channel_name, file)
            self.storage_util.upload_archive_to_bucket(channel_name)
            logging.info("Uploaded files till now: {0}".format(self.batch_count))