Example #1
    def collect_ancillary_tweets(self, filter_name, method='twint'):
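        # Resolve the filter's tracking terms and yesterday's newly seen videos
        # via Athena, then collect the matching user tweets with twint (default)
        # or tweepy and export the results for that day.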
        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)

        filter_terms = athena_db.query_athena_and_get_result(
            query_string=FILTER_TERMS.format(name=filter_name))['track']

        new_videos_yesterday = Path(
            Path(__file__).parent, 'tmp', 'new_videos_yesterday.csv')
        Path(new_videos_yesterday).parent.mkdir(parents=True, exist_ok=True)
        new_videos_yesterday_file = athena_db.query_athena_and_download(
            query_string=NEW_VIDEOS_YESTERDAY, filename=new_videos_yesterday)
        yesterday = athena_db.query_athena_and_get_result(
            query_string=YESTERDAY)['yesterday']

        if method == 'twint':
            self.collect_user_tweets_twint(
                filter_terms=filter_terms,
                new_videos_yesterday_file=new_videos_yesterday_file)
            self.export_twint(yesterday=yesterday)
        else:
            self.collect_user_tweets_tweepy(
                filter_terms=filter_terms,
                new_videos_yesterday_file=new_videos_yesterday_file)
            self.export_tweepy(yesterday=yesterday)
Example #2
def create_edges(min_users, timespan, final_date, end):
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    min_date = athena_db.query_athena_and_get_result(
        query_string=MIN_DATE)['min_date']
    min_date = datetime.strptime(min_date, '%Y-%m-%d').date()
    initial_date = final_date - timedelta(days=timespan - 1)
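    # Walk a sliding window of `timespan` days up to `end`, exporting the edges
    # for each window to a partitioned S3 key, then rebuild the Athena table.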
    while final_date <= end:
        print('Edges - {}'.format(str(final_date)))
        if initial_date >= min_date:
            edges = athena_db.query_athena_and_download(
                query_string=SELECT_EDGES.format(
                    initial_date=str(initial_date),
                    final_date=str(final_date),
                    min_users=min_users),
                filename='edges.csv')
            compressed_file = compress(filename=edges, delete_original=True)
            s3_filename = "youtube_graph_edge/min_users={min_users}/" \
                          "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format(
                min_users=min_users,
                timespan=timespan,
                final_date=str(final_date))
            s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                                      s3_filename)
        final_date = final_date + timedelta(days=1)
        initial_date = initial_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_edge')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_EDGE.format(
            s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_edge')
Example #3
    def update_table_youtube_twitter_addition(self):
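        # Download the ids of today's new videos from Athena, write them to a
        # gzipped file, upload it as a new dated partition, and repair the table.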
        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)
        new_videos_filename = Path(
            Path(__file__).parent, 'tmp', 'new_videos_today.csv')
        Path(new_videos_filename).parent.mkdir(parents=True, exist_ok=True)
        new_videos = athena_db.query_athena_and_download(
            query_string=NEW_VIDEOS_TODAY, filename=new_videos_filename)

        new_videos_compressed = Path(
            Path(__file__).parent, 'tmp', 'new_videos.csv.gz')
        with open(str(new_videos), 'rt') as f_in:
            with gzip.open(str(new_videos_compressed), 'wt') as f_out:
                reader = csv.DictReader(f_in)
                for video_id in reader:
                    f_out.write(video_id['id'] + '\n')

        s3 = boto3.resource('s3')
        s3_filename = "youtube_twitter_addition/creation_date={}/video_ids.csv.gz".format(
            datetime.utcnow().strftime("%Y-%m-%d"))
        s3.Bucket(self.s3_data).upload_file(str(new_videos_compressed),
                                            s3_filename)

        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_twitter_addition")
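Example #4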
    def repair_table_metadata(self):
        print("Going to repair table metadata...")
        athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                   s3_output=self.bucket)
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE metadata")
        self.repair_metadata = False
        print("Done.")
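Example #5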
    def add_timeframe_section_to_s3_path(self):
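        # Re-key the existing word files in S3 so their paths include the
        # timeframe=4/section=1 components.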
        athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                   s3_output=self.bucket)

        # self.repair_table_metadata()
        # metadata_records = athena_db.query_athena_and_download(query_string="select distinct service, project, "
        #                                                                     "speaker, performance_date, speaker_type "
        #                                                                     "from metadata "
        #                                                                     "order by service, project, speaker, "
        #                                                                     "performance_date, speaker_type",
        #                                                        filename='metadata_records.csv')
        # with open(metadata_records) as metadata_file:
        #     metadata_reader = csv.DictReader(metadata_file)
        #     for row in metadata_reader:
        #         print(f"{row['service']}/{row['project']}/{row['speaker']}/{row['performance_date']}/{row['speaker_type']}")
        #         move_data_in_s3(
        #             bucket_name=self.bucket,
        #             origin=f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/timeframe=4/timesection=1/speaker_type={row['speaker_type']}/transcript.json.bz2",
        #             destination=f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/speaker_type={row['speaker_type']}/timeframe=4/section=1/transcript.json.bz2"
        #         )

        self.repair_table_word()
        words_records = athena_db.query_athena_and_download(
            query_string="select distinct project, speaker, "
            "performance_date, service, protagonist "
            "from word "
            "order by project, speaker, performance_date, "
            "service, protagonist",
            filename='words_records.csv')
        with open(words_records) as words_file:
            words_reader = csv.DictReader(words_file)
            for row in words_reader:
                move_data_in_s3(
                    bucket_name=self.bucket,
                    origin=
                    f"word/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/service={row['service']}/protagonist={row['protagonist']}/word.json.bz2",
                    destination=
                    f"word/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/service={row['service']}/protagonist={row['protagonist']}/timeframe=4/section=1/word.json.bz2"
                )
Example #6
    def collect_related_video(self, region_code, creation_date=None):
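        # For each recent trending video, ask the YouTube Data API for related
        # videos, rotating developer keys on quota errors, then upload the
        # results to S3 and rebuild the youtube_related_video Athena table.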
        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)

        trending_filename = Path(Path(__file__).parent, 'tmp', 'trending.csv')
        Path(trending_filename).parent.mkdir(parents=True, exist_ok=True)

        if creation_date is None:
            query_string = TRENDING_VIDEOS.format(
                creation_date=(
                    date.today() -
                    timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
                number_of_videos=self.NUMBER_OF_VIDEOS)
        else:
            query_string = TRENDING_VIDEOS.format(
                creation_date=(
                    datetime.strptime(creation_date, '%Y-%m-%d') -
                    timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
                number_of_videos=self.NUMBER_OF_VIDEOS)

        trending_videos = athena_db.query_athena_and_download(
            query_string=query_string, filename=trending_filename)

        with open(trending_videos, newline='', encoding="utf8") as csv_reader:
            output_json = Path(
                Path(__file__).parent, 'tmp', 'youtube_related_video.json')
            Path(output_json).parent.mkdir(parents=True, exist_ok=True)

            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                current_key = 0
                try:
                    youtube = googleapiclient.discovery.build(
                        serviceName="youtube",
                        version="v3",
                        developerKey=self.credentials[current_key]
                        ['developer_key'],
                        cache_discovery=False)
                except UnknownApiNameOrVersion as e:
                    service = read_dict_from_url(
                        url=
                        "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                    )
                    youtube = googleapiclient.discovery.build_from_document(
                        service=service,
                        developerKey=self.credentials[current_key]
                        ['developer_key'])
                num_videos = 0
                if creation_date is None:
                    max_results = self.NUMBER_OF_RELATED_VIDEOS
                    part = 'id'
                else:
                    part = 'snippet'
                    max_results = self.NUMBER_OF_RELATED_VIDEOS * 3
                    if max_results > 50:
                        max_results = 50
                for trending_video in reader:
                    service_unavailable = 0
                    connection_reset_by_peer = 0
                    no_response = True
                    response = dict()
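                    # Retry until a response arrives: rotate developer keys on 403
                    # errors, rebuild the client after connection resets, back off
                    # on 503s, and skip videos that are no longer available.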
                    while no_response:
                        try:
                            response = youtube.search().list(
                                part=part,
                                type='video',
                                regionCode=region_code,
                                relatedToVideoId=trending_video['id'],
                                maxResults=max_results).execute()
                            no_response = False
                        except SocketError as e:
                            if e.errno != errno.ECONNRESET:
                                logging.info("Other socket error!")
                                raise
                            else:
                                connection_reset_by_peer = connection_reset_by_peer + 1
                                logging.info(
                                    "Connection reset by peer! {}".format(
                                        connection_reset_by_peer))
                                if connection_reset_by_peer <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_CONNECTION_RESET_BY_PEER
                                    )
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[
                                                current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url=
                                            "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                                        )
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[
                                                current_key]['developer_key'])
                                else:
                                    raise
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Invalid {} developer key: {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[
                                                current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url=
                                            "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                                        )
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[
                                                current_key]['developer_key'])
                            elif "Backend Error" in str(e):
                                # Backend errors are usually associated with requesting
                                # recommended videos for a video that was deleted by
                                # the user. In that case, just move on.
                                logging.info(
                                    "Backend error. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "Not Found" in str(e):
                                # "Not Found" errors usually mean the seed video has
                                # been deleted by the user. In that case, just move on.
                                logging.info(
                                    "Not Found error. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "404" in str(e):
                                logging.info(
                                    "Requested entity was not found. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "400" in str(e):
                                logging.info(
                                    "Invalid argument. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise

                    rank = 1
                    for item in response.get('items', []):
                        item['relatedToVideoId'] = trending_video['id']
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        item['rank'] = rank
                        if creation_date is None:
                            rank = rank + 1
                            num_videos = num_videos + 1
                            json_writer.write("{}\n".format(json.dumps(item)))
                        else:
                            item['snippet']['publishedAt'] = item['snippet'][
                                'publishedAt'].rstrip('Z').replace('T', ' ')
                            if rank <= self.NUMBER_OF_RELATED_VIDEOS:
                                if item['snippet'][
                                        'publishedAt'] <= creation_date + ' 00:00:00.000':
                                    rank = rank + 1
                                    num_videos = num_videos + 1
                                    json_writer.write("{}\n".format(
                                        json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        if creation_date is None:
            s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
                creation_date=datetime.utcnow().strftime("%Y-%m-%d"),
                num_videos=num_videos)
        else:
            s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
                creation_date=creation_date, num_videos=num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube related video snippets")
        athena_db.query_athena_and_wait(
            query_string="DROP TABLE IF EXISTS youtube_related_video")
        athena_db.query_athena_and_wait(
            query_string=CREATE_VIDEO_RELATED_JSON.format(
                s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_related_video")

        logging.info("Concluded collecting related video snippets")
Example #7
    def collect_video_snippets(self):
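        # Fetch snippets for every video id referenced in the Twitter stream (and,
        # if present, in the related-video table) that has not been processed yet,
        # then upload them to S3 and rebuild the youtube_video_snippet table.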
        logging.info("Start collecting video snippets")
        athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
        if not athena.table_exists("youtube_video_snippet"):
            select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO
            select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO
        else:
            logging.info("Table youtube_video_snippet exists")
            select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO + EXTRA_TWITTER_STREAM_VIDEO
            select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO + EXTRA_YOUTUBE_RELATED_VIDEO
        queries = [select_twitter_stream_video]
        if athena.table_exists("youtube_related_video"):
            queries.append(select_youtube_related_video)
        query = " union all ".join(queries)
        query_count = SELECT_COUNT.format(query)
        query_group_by = SELECT_GROUP_BY.format(query)
        logging.info("Download IDs for all Youtube videos that have not been processed yet")
        video_count = int(athena.query_athena_and_get_result(query_string=query_count)['video_count'])
        logging.info("There are %d links to be processed: download them", video_count)
        video_ids_csv = athena.query_athena_and_download(query_string=query_group_by, filename="video_ids.csv")

        output_json = Path(Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
        Path(output_json).parent.mkdir(parents=True, exist_ok=True)
        current_key = 0
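        # Build the YouTube client; if the discovery service cannot resolve the
        # API name, fall back to building it from the discovery document URL.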
        try:
            youtube = googleapiclient.discovery.build(serviceName="youtube",
                                                      version="v3",
                                                      developerKey=
                                                      self.credentials[current_key]['developer_key'],
                                                      cache_discovery=False)
        except UnknownApiNameOrVersion as e:
            service = read_dict_from_url(url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
            youtube = googleapiclient.discovery.build_from_document(service=service,
                                                                    developerKey=self.credentials[current_key][
                                                                        'developer_key'])
        with open(video_ids_csv, newline='') as csv_reader:
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_videos = 0
                for video_id in reader:
                    if num_videos % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d videos processed", num_videos, video_count)
                    num_videos = num_videos + 1

                    connection_reset_by_peer = 0
                    service_unavailable = 0
                    no_response = True
                    response = dict()
                    while no_response:
                        try:
                            response = youtube.videos().list(part="snippet", id=video_id['video_id']).execute()
                            no_response = False
                        except SocketError as e:
                            if e.errno != errno.ECONNRESET:
                                logging.info("Other socket error!")
                                raise
                            else:
                                connection_reset_by_peer = connection_reset_by_peer + 1
                                logging.info("Connection reset by peer! {}".format(connection_reset_by_peer))
                                if connection_reset_by_peer <= 10:
                                    time.sleep(self.WAIT_WHEN_CONNECTION_RESET_BY_PEER)
                                    try:
                                        youtube = googleapiclient.discovery.build(serviceName="youtube",
                                                                                  version="v3",
                                                                                  developerKey=
                                                                                  self.credentials[current_key][
                                                                                      'developer_key'],
                                                                                  cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                        youtube = googleapiclient.discovery.build_from_document(service=service,
                                                                                                developerKey=
                                                                                                self.credentials[
                                                                                                    current_key][
                                                                                                    'developer_key'])
                                else:
                                    raise
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info("Invalid {} developer key: {}".format(
                                    current_key,
                                    self.credentials[current_key]['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    try:
                                        youtube = googleapiclient.discovery.build(serviceName="youtube",
                                                                                  version="v3",
                                                                                  developerKey=
                                                                                  self.credentials[current_key][
                                                                                      'developer_key'],
                                                                                  cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                        youtube = googleapiclient.discovery.build_from_document(service=service,
                                                                                                developerKey=
                                                                                                self.credentials[
                                                                                                    current_key][
                                                                                                    'developer_key'])
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    if len(response.get('items', [])) == 0:
                        response['id'] = video_id['video_id']
                        response['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                        response['description'] = "Video unavailable. It has probably been removed by the user."
                        json_writer.write("{}\n".format(json.dumps(response)))
                    else:
                        for item in response['items']:
                            item['snippet']['publishedAt'] = item['snippet']['publishedAt'].rstrip('Z').replace('T', ' ')
                            item['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                            json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_video_snippet/creation_date={}/{}-{}.json.bz2".format(datetime.utcnow().strftime("%Y-%m-%d"),
                                                                                     uuid.uuid4().hex,
                                                                                     num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube channel stats")
        athena.query_athena_and_wait(query_string="DROP TABLE IF EXISTS youtube_video_snippet")
        athena.query_athena_and_wait(query_string=CREATE_VIDEO_SNIPPET_JSON.format(s3_bucket=self.s3_data))
        athena.query_athena_and_wait(query_string="MSCK REPAIR TABLE youtube_video_snippet")

        logging.info("Concluded collecting video snippets")
Example #8
    def collect_channel_stats(self):
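        # Download the distinct channel ids from Athena, fetch statistics for each
        # channel while rotating developer keys when one is rejected, then upload
        # the results to S3 and rebuild the youtube_channel_stats table.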
        logging.info("Start collecting Youtube channel stats")
        channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv')
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)
        athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL,
                                         filename=channel_ids)
        channel_count = int(
            athena.query_athena_and_get_result(
                query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count'])
        logging.info("There are %d channels to be processed: download them",
                     channel_count)

        current_key = 0
        youtube = googleapiclient.discovery.build(
            serviceName="youtube",
            version="v3",
            developerKey=self.credentials[current_key]['developer_key'],
            cache_discovery=False)
        with open(channel_ids, newline='') as csv_reader:
            output_json = Path(
                Path(__file__).parent, 'tmp', 'youtube_channel_stats.json')
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_channels = 0
                for channel_id in reader:
                    if num_channels % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d channels processed",
                                     num_channels, channel_count)
                    num_channels = num_channels + 1

                    service_unavailable = 0
                    no_response = True
                    while no_response:
                        try:
                            response = youtube.channels().list(
                                part="statistics",
                                id=channel_id['channel_id']).execute()
                            no_response = False
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Invalid {} developer key: {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube",
                                        version="v3",
                                        developerKey=self.credentials[
                                            current_key]['developer_key'],
                                        cache_discovery=False)
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    for item in response.get('items', []):
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_channel_stats/creation_date={}/{}.json.bz2".format(
            datetime.utcnow().strftime("%Y-%m-%d"), num_channels)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube channel stats")
        athena.query_athena_and_wait(
            query_string="DROP TABLE IF EXISTS youtube_channel_stats")
        athena.query_athena_and_wait(
            query_string=CREATE_CHANNEL_STATS_JSON.format(
                s3_bucket=self.s3_data))
        athena.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_channel_stats")

        logging.info("Concluded collecting channel stats")
    def inner_retrieve_transcript(self, project, speaker, performance_date,
                                  speaker_type, part, timeframe, language,
                                  filepath, microsoft, ibm, aws, google):
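        # Slice the audio into timeframe-hour chunks, filter the candidate
        # (service, section) jobs through Athena, and launch a cloud transcriber
        # for each job that comes back.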
        athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                   s3_output=self.bucket)

        # delete existing sections
        if microsoft:
            self.delete_different_timeframe(service='microsoft',
                                            project=project,
                                            speaker=speaker,
                                            performance_date=performance_date,
                                            speaker_type=speaker_type,
                                            part=part,
                                            timeframe=timeframe)
        if google:
            self.delete_different_timeframe(service='google',
                                            project=project,
                                            speaker=speaker,
                                            performance_date=performance_date,
                                            speaker_type=speaker_type,
                                            part=part,
                                            timeframe=timeframe)
        if ibm:
            self.delete_different_timeframe(service='ibm',
                                            project=project,
                                            speaker=speaker,
                                            performance_date=performance_date,
                                            speaker_type=speaker_type,
                                            part=part,
                                            timeframe=timeframe)
        if aws:
            self.delete_different_timeframe(service='aws',
                                            project=project,
                                            speaker=speaker,
                                            performance_date=performance_date,
                                            speaker_type=speaker_type,
                                            part=part,
                                            timeframe=timeframe)
        if self.repair_metadata:
            self.repair_table_metadata()

        # create audio object and slice it according to timeframe (in hours)
        extension = Path(filepath).suffix[1:]
        sound = AudioSegment.from_file(filepath, extension)
        sound = sound.set_channels(1)
        if (timeframe * 60 * 60) > 13200.0:  # more than 3 hours and 40 minutes
            self.instance_type = 't3a.micro'
        chunk_length_ms = timeframe * 60 * 60 * 1000  # pydub calculates in millisec
        chunks = make_chunks(sound,
                             chunk_length_ms)  # split audio into timeframe-sized chunks

        # determine list of jobs that need to be performed
        jobs = list()
        for i in range(1, len(chunks) + 1):
            if microsoft:
                jobs.append({
                    'project': project,
                    'speaker': speaker,
                    'performance_date': performance_date,
                    'speaker_type': speaker_type,
                    'part': part,
                    'timeframe': timeframe,
                    'section': i,
                    'service': 'microsoft'
                })
            if ibm:
                jobs.append({
                    'project': project,
                    'speaker': speaker,
                    'performance_date': performance_date,
                    'speaker_type': speaker_type,
                    'part': part,
                    'timeframe': timeframe,
                    'section': i,
                    'service': 'ibm'
                })
            if aws:
                jobs.append({
                    'project': project,
                    'speaker': speaker,
                    'performance_date': performance_date,
                    'speaker_type': speaker_type,
                    'part': part,
                    'timeframe': timeframe,
                    'section': i,
                    'service': 'aws'
                })
            if google:
                jobs.append({
                    'project': project,
                    'speaker': speaker,
                    'performance_date': performance_date,
                    'speaker_type': speaker_type,
                    'part': part,
                    'timeframe': timeframe,
                    'section': i,
                    'service': 'google'
                })
        jobs_values = ""
        for i in range(len(jobs)):
            jobs_values = f"('{jobs[i]['service']}','{jobs[i]['project']}','{jobs[i]['speaker']}'," \
                          f"'{jobs[i]['performance_date']}',{jobs[i]['part']},'{jobs[i]['speaker_type']}'," \
                          f"{jobs[i]['timeframe']},{jobs[i]['section']}),{jobs_values}"
        jobs_values = jobs_values[:-1]  # eliminate the final comma
        jobs_athena = athena_db.query_athena_and_download(
            query_string=SELECT_JOBS.format(jobs_values=jobs_values),
            filename='jobs.csv')
        with open(jobs_athena) as jobs_file:
            jobs_reader = csv.DictReader(jobs_file)
            jobs = list()
            for row in jobs_reader:
                jobs.append({
                    'service': row['service'],
                    'project': row['project'],
                    'speaker': row['speaker'],
                    'performance_date': row['performance_date'],
                    'part': row['part'],
                    'speaker_type': row['speaker_type'],
                    'timeframe': row['timeframe'],
                    'section': row['section']
                })

        # export audio and instantiate cloud transcribers
        if len(jobs) > 0:
            created_audio = False
            Path('./audio/').mkdir(parents=True, exist_ok=True)
            destination = list()
            try:
                # export audio
                for i, chunk in enumerate(chunks):
                    temp_sound_file = f"./audio/{uuid.uuid4()}.wav"
                    chunk.export(temp_sound_file,
                                 format="wav",
                                 parameters=['-acodec', 'pcm_s16le'])
                    created_audio = True
                    destination.append(temp_sound_file)
                # instantiate cloud transcribers
                for job in jobs:
                    self.instantiate_cloud_transcriber(
                        service=job['service'],
                        project=job['project'],
                        performance_date=job['performance_date'],
                        part=job['part'],
                        timeframe=job['timeframe'],
                        section=job['section'],
                        language=language,
                        speaker=job['speaker'],
                        speaker_type=job['speaker_type'],
                        filepath=destination[int(job['section']) -
                                             1])  # destination is zero-based
            finally:
                if created_audio:
                    shutil.rmtree("./audio")
Example #10
    def collect_channel_stats(self):
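        # Producer side: push the developer keys and batches of channel ids onto
        # SQS queues for the consumers that collect the channel statistics.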
        logging.info("Start collecting Youtube channel stats - producer")

        sqs = boto3.resource('sqs')
        credentials_queue = sqs.get_queue_by_name(
            QueueName='youtube_credentials')
        credentials_queue.purge()
        logging.info('Cleaned queue for credentials')
        for credential in self.credentials:
            credentials_queue.send_message(
                MessageBody=credential['developer_key'])
        logging.info('Enqueued credentials')

        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)
        logging.info("Recreate table for Youtube channel stats")
        athena.query_athena_and_wait(
            query_string="DROP TABLE IF EXISTS youtube_channel_stats")
        athena.query_athena_and_wait(
            query_string=CREATE_CHANNEL_STATS_JSON.format(
                s3_bucket=self.s3_data))
        athena.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_channel_stats")
        channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv')
        athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL,
                                         filename=channel_ids)
        channel_count = int(
            athena.query_athena_and_get_result(
                query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count'])
        logging.info("There are %d channels to be processed: download them",
                     channel_count)

        with open(channel_ids, newline='') as csv_reader:
            reader = csv.DictReader(csv_reader)
            channels_queue = sqs.get_queue_by_name(
                QueueName='youtube_channels')
            channels_queue.purge()
            logging.info('Cleaned queue for channels')
            channels_message = list()
            for channel_id in reader:
                channels_message.append(channel_id['channel_id'])
                if len(channels_message) == self.QUEUING_INTERVAL:
                    channels_queue.send_message(
                        MessageBody=json.dumps(channels_message))
                    channels_message = list()
                    logging.info("Added %d channels to the queue",
                                 self.QUEUING_INTERVAL)
            if len(channels_message) != 0:
                channels_queue.send_message(
                    MessageBody=json.dumps(channels_message))
                logging.info("Added %d channels to the queue",
                             len(channels_message))

        logging.info("Concluded collecting channel stats - producer")
def import_data(related_date, end_related_date, graph_date_difference,
                timespan):
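    # Copy recommendation, Twitter-user and channel data from Athena into a local
    # SQLite database, derive the aggregate columns, then compress the database
    # and upload it to S3.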
    database = sqlite3.connect('./youtube_recommendations.sqlite')
    sqlite_aws = SqliteAWS(database=database,
                           s3_admin='internet-scholar-admin',
                           s3_data='internet-scholar',
                           athena_db='internet_scholar')
    logging.info('Retrieve recommendations...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='recommendation_aux',
        query=RECOMMENDATION.format(begin_date=str(related_date),
                                    end_date=str(end_related_date)))

    logging.info('Add primary key to recommendation table...')
    database.execute(CREATE_TABLE_RECOMMENDATION)
    database.execute(INSERT_TABLE_RECOMMENDATION)
    database.execute('DROP TABLE recommendation_aux')
    logging.info('Update categories and null values...')
    database.execute(UPDATE_CATEGORY_SEED)
    database.execute(UPDATE_CATEGORY_RECOMMENDED)
    database.execute(UPDATE_NULL_SEED)
    database.execute(UPDATE_NULL_RECOMMENDED)

    logging.info('Retrieve Twitter users and YouTube channel data...')
    initial_date = related_date + timedelta(
        days=graph_date_difference) - timedelta(days=timespan - 1)
    final_date = end_related_date + timedelta(days=graph_date_difference)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='twitter_user_channel',
        query=TWITTER_USER_CHANNEL.format(initial_date=str(initial_date),
                                          final_date=str(final_date)))

    logging.info('Calculate number of common Twitter users per channel...')
    database.execute(CREATE_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS)
    current_date = related_date
    while current_date <= end_related_date:
        logging.info(str(current_date))
        initial_date = current_date + timedelta(
            days=graph_date_difference) - timedelta(days=timespan - 1)
        final_date = current_date + timedelta(days=graph_date_difference)
        database.execute(
            INSERT_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS.format(
                initial_date=initial_date, final_date=final_date))
        current_date = current_date + timedelta(days=1)
    logging.info('Update aggregate on SQLite table 1...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN seed_user_count INT")
    database.execute(UPDATE_SEED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 2...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN recommended_user_count INT")
    database.execute(UPDATE_RECOMMENDED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 3...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN common_user_count INT")
    database.execute(UPDATE_COMMON_USER_COUNT)

    logging.info('Retrieve info about political leaning...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_political_leaning',
        query=SELECT_POLITICAL_LEANING.format(
            initial_date=str(related_date), final_date=str(end_related_date)))
    logging.info('Update political leaning info on SQLite 1...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN seed_political_leaning TEXT")
    database.execute(UPDATE_SEED_POLITICAL_LEANING)
    logging.info('Update political leaning info on SQLite 2...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN recommended_political_leaning TEXT"
    )
    database.execute(UPDATE_RECOMMENDED_POLITICAL_LEANING)

    logging.info('Retrieve data on channel stats...')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    athena_db.query_athena_and_wait(
        query_string=CREATE_VIEW_ENHANCED_CHANNEL_STATS)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_stats',
        query=SELECT_ENHANCED_STATS.format(initial_date=str(related_date),
                                           final_date=str(end_related_date)))
    logging.info('Add primary key to channel stats...')
    database.execute(CREATE_CHANNEL_STATS_WITH_PRIMARY_KEY)
    database.execute(INSERT_CHANNEL_STATS_WITH_PRIMARY_KEY)
    add_stat_to_sqlite(database, field='view_count')
    add_stat_to_sqlite(database, field='cumulative_view_count')
    add_stat_to_sqlite(database, field='subscriber_count')
    add_stat_to_sqlite(database, field='cumulative_subscriber_count')
    add_stat_to_sqlite(database, field='video_count')
    add_stat_to_sqlite(database, field='cumulative_video_count')
    add_stat_to_sqlite(database, field='comment_count')
    add_stat_to_sqlite(database, field='cumulative_comment_count')

    database.execute('DROP TABLE channel_political_leaning')
    database.execute('DROP TABLE channel_stats')
    database.execute('DROP TABLE channel_stats_with_primary_key')
    database.execute('DROP TABLE twitter_user_channel')
    database.execute('DROP TABLE youtube_channel_common_twitter_users')
    database.commit()

    database.execute('VACUUM')
    database.close()

    new_filename = compress('./youtube_recommendations.sqlite')
    s3_filename = "youtube_data_export_r/{timestamp}.sqlite.bz2".format(
        timestamp=datetime.utcnow().strftime("%Y%m%d-%H%M%S"))
    s3 = boto3.resource('s3')
    s3.Bucket('internet-scholar').upload_file(str(new_filename), s3_filename)
    def export_google_sheets(self,
                             project=None,
                             speaker=None,
                             interval_in_seconds=10):
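        # Export transcripts to Google Sheets: one Drive folder per project, one
        # spreadsheet per speaker, and one sheet per performance date / part,
        # skipping speakers whose spreadsheet already exists.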
        self.parse_words(project=project, speaker=speaker)

        Path("./csv/").mkdir(parents=True, exist_ok=True)
        print("Export CSVs...")
        try:
            json_string = json.dumps(self.config['google'])
            Path('./local_credentials/').mkdir(parents=True, exist_ok=True)
            temp_file = f"./local_credentials/{uuid.uuid4()}.json"
            with open(temp_file, 'w', encoding="utf-8") as json_file:
                json_file.write(json_string)
            try:
                credentials_google_drive = service_account.Credentials.from_service_account_file(
                    temp_file,
                    scopes=['https://www.googleapis.com/auth/drive'])
                credentials_google_sheets = service_account.Credentials.from_service_account_file(
                    temp_file,
                    scopes=['https://www.googleapis.com/auth/spreadsheets'])
            finally:
                shutil.rmtree('./local_credentials')
            google_drive = build('drive',
                                 'v3',
                                 credentials=credentials_google_drive)
            google_sheets = build('sheets',
                                  'v4',
                                  credentials=credentials_google_sheets)

            athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                       s3_output=self.bucket)

            all_projects = athena_db.query_athena_and_download(
                query_string=SELECT_ALL_PROJECTS.format(
                    where_clause=self.get_where_clause(project=project,
                                                       speaker=speaker)),
                filename='selected_all_projects.csv')
            with open(all_projects) as all_projects_csv:
                projects_reader = csv.DictReader(all_projects_csv)
                for projects_row in projects_reader:
                    response_project = google_drive.files().list(
                        q=f"mimeType='application/vnd.google-apps.folder' and "
                        f"'{self.config['google']['transcription_folder']}' in parents and "
                        f"name='{projects_row['project']}'",
                        spaces='drive',
                        fields='files(id, name)').execute()
                    if len(response_project['files']) == 0:
                        folder_metadata = {
                            'name':
                            projects_row['project'],
                            'mimeType':
                            'application/vnd.google-apps.folder',
                            'parents': [
                                self.config['google']['transcription_folder'],
                            ]
                        }
                        project_folder = google_drive.files().create(
                            body=folder_metadata, fields='id').execute()
                        project_id = project_folder['id']
                    elif len(response_project['files']) == 1:
                        project_id = response_project['files'][0]['id']
                    else:
                        raise Exception(
                            "Error! Should not have more than 1 folder for this project!"
                        )

                    all_speakers = athena_db.query_athena_and_download(
                        query_string=SELECT_ALL_SPEAKERS.format(
                            where_clause=self.get_where_clause(
                                project=projects_row['project'],
                                speaker=speaker)),
                        filename='selected_all_speakers.csv')
                    with open(all_speakers) as all_speakers_csv:
                        speakers_reader = csv.DictReader(all_speakers_csv)
                        for speakers_row in speakers_reader:
                            response_spreadsheet = google_drive.files().list(
                                q=
                                f"mimeType='application/vnd.google-apps.spreadsheet' and '{project_id}' in parents and name='{speakers_row['speaker']}'",
                                spaces='drive',
                                fields='files(id, name)').execute()
                            if len(response_spreadsheet['files']) == 1:
                                print(
                                    f"Spreadsheet for {speakers_row['speaker']} already exists. I will not overwrite."
                                )
                            elif len(response_spreadsheet['files']) >= 2:
                                raise Exception(
                                    "Error! Should not have more than 1 spreadsheet for this project!"
                                )
                            else:  # it is 0
                                body = {
                                    'mimeType':
                                    'application/vnd.google-apps.spreadsheet',
                                    'name': speakers_row['speaker'],
                                    'parents': [
                                        project_id,
                                    ]
                                }
                                response = google_drive.files().create(
                                    body=body, fields='id').execute()
                                speaker_id = response['id']
                                all_parts = athena_db.query_athena_and_download(
                                    query_string=SELECT_ALL_PARTS.format(
                                        where_clause=self.get_where_clause(
                                            project=projects_row['project'],
                                            speaker=speakers_row['speaker'])),
                                    filename='selected_all_parts.csv')
                                with open(all_parts) as all_parts_csv:
                                    parts_reader = csv.DictReader(
                                        all_parts_csv)
                                    first_sheet = True
                                    for parts_row in parts_reader:
                                        filename = f"{projects_row['project']}_{speakers_row['speaker']}_" \
                                                   f"{parts_row['performance_date']}_{parts_row['part']}_{interval_in_seconds}.csv"
                                        print(filename)
                                        new_file = athena_db.query_athena_and_download(
                                            SELECT_TRANSCRIPT.format(
                                                project=projects_row[
                                                    'project'],
                                                speaker=speakers_row[
                                                    'speaker'],
                                                performance_date=parts_row[
                                                    'performance_date'],
                                                part=parts_row['part'],
                                                interval_in_seconds=
                                                interval_in_seconds), filename)
                                        os.replace(new_file,
                                                   f'./csv/{filename}')
                                        if first_sheet:
                                            body = {
                                                'requests': [{
                                                    "updateSheetProperties": {
                                                        "fields":
                                                        "title,gridProperties.rowCount,gridProperties.columnCount,gridProperties.frozenRowCount",
                                                        "properties": {
                                                            "title":
                                                            f"{parts_row['performance_date']} / {parts_row['part']}",
                                                            "gridProperties": {
                                                                "rowCount": 3,
                                                                "columnCount":
                                                                3,
                                                                "frozenRowCount":
                                                                1
                                                            },
                                                            "index": 0
                                                        }
                                                    }
                                                }],
                                                'includeSpreadsheetInResponse':
                                                True
                                            }
                                            response = google_sheets.spreadsheets(
                                            ).batchUpdate(
                                                spreadsheetId=speaker_id,
                                                body=body).execute()
                                            sheet_id = response[
                                                'updatedSpreadsheet'][
                                                    'sheets'][0]['properties'][
                                                        'sheetId']
                                            first_sheet = False
                                        else:
                                            body = {
                                                "requests": {
                                                    "addSheet": {
                                                        "properties": {
                                                            "title":
                                                            f"{parts_row['performance_date']} / {parts_row['part']}",
                                                            "gridProperties": {
                                                                "rowCount": 3,
                                                                "columnCount":
                                                                3,
                                                                "frozenRowCount":
                                                                1
                                                            }
                                                        }
                                                    }
                                                }]
                                            }
                                            response = google_sheets.spreadsheets(
                                            ).batchUpdate(
                                                spreadsheetId=speaker_id,
                                                body=body).execute()
                                            sheet_id = response['replies'][0][
                                                'addSheet']['properties'][
                                                    'sheetId']
                                        with open(
                                                f'./csv/{filename}',
                                                'r',
                                                encoding="utf-8") as csv_file:
                                            csv_contents = csv_file.read()
                                        body = {
                                            'requests': [{
                                                'pasteData': {
                                                    "coordinate": {
                                                        "sheetId": sheet_id,
                                                        "rowIndex":
                                                        "0",  # adapt this if you need different positioning
                                                        "columnIndex":
                                                        "0",  # adapt this if you need different positioning
                                                    },
                                                    "data": csvContents,
                                                    "type": 'PASTE_NORMAL',
                                                    "delimiter": ',',
                                                }
                                            }]
                                        }
                                        response = google_sheets.spreadsheets(
                                        ).batchUpdate(spreadsheetId=speaker_id,
                                                      body=body).execute()
                                        body = {
                                            "requests": [{
                                                "repeatCell": {
                                                    "range": {
                                                        "sheetId": sheet_id,
                                                        "startRowIndex": 0,
                                                        "startColumnIndex": 0
                                                    },
                                                    "cell": {
                                                        "userEnteredFormat": {
                                                            "verticalAlignment":
                                                            "TOP",
                                                            "wrapStrategy":
                                                            "WRAP"
                                                        },
                                                    },
                                                    "fields":
                                                    "userEnteredFormat.wrapStrategy,userEnteredFormat.verticalAlignment"
                                                }
                                            }, {
                                                "updateDimensionProperties": {
                                                    "range": {
                                                        "sheetId": sheet_id,
                                                        "dimension": "COLUMNS",
                                                        "startIndex": 0,
                                                        "endIndex": 1
                                                    },
                                                    "properties": {
                                                        "pixelSize": 60
                                                    },
                                                    "fields": "pixelSize"
                                                }
                                            }, {
                                                "updateDimensionProperties": {
                                                    "range": {
                                                        "sheetId": sheet_id,
                                                        "dimension": "COLUMNS",
                                                        "startIndex": 1
                                                    },
                                                    "properties": {
                                                        "pixelSize": 280
                                                    },
                                                    "fields": "pixelSize"
                                                }
                                            }]
                                        }
                                        response = google_sheets.spreadsheets(
                                        ).batchUpdate(spreadsheetId=speaker_id,
                                                      body=body).execute()
        finally:
            shutil.rmtree('./csv')

    def parse_words(self,
                    project=None,
                    speaker=None,
                    performance_date=None,
                    part=None):
        self.repair_table_metadata()
        self.repair_table_word()

        where_clause = self.get_where_clause(project=project,
                                             speaker=speaker,
                                             performance_date=performance_date,
                                             part=part)
        if where_clause == '':
            select = SELECT_NON_PARSED_TRANSCRIPTS.format(
                where_clause=where_clause).replace(' and ', ' where ',
                                                   1).replace('\n\n', '\n')
        else:
            select = SELECT_NON_PARSED_TRANSCRIPTS.format(
                where_clause=where_clause)
        athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                   s3_output=self.bucket)
        unparsed_records = athena_db.query_athena_and_download(
            query_string=select, filename='unparsed_records.csv')
        with open(unparsed_records) as unparsed_file:
            reader = csv.DictReader(unparsed_file)
            database_has_changed = False
            try:
                print("Parse words...")
                for row in reader:
                    print(
                        f"{row['speaker']}_{row['performance_date']}_{row['part']}_{row['service']}_{row['speaker_type']}_{row['section']}"
                    )
                    transcript = read_dict_from_s3(
                        self.bucket,
                        f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/"
                        f"performance_date={row['performance_date']}/part={row['part']}/"
                        f"speaker_type={row['speaker_type']}/timeframe={row['timeframe']}/"
                        f"section={row['section']}/transcript.json.bz2",
                        compressed=True)
                    protagonist_words, non_protagonist_words = parse_words(
                        transcript=transcript,
                        speaker_type=row['speaker_type'],
                        service=row['service'])
                    partitions = OrderedDict()
                    partitions['project'] = row['project']
                    partitions['speaker'] = row['speaker']
                    partitions['performance_date'] = row['performance_date']
                    partitions['part'] = row['part']
                    partitions['service'] = row['service']
                    partitions['protagonist'] = -1
                    partitions['timeframe'] = row['timeframe']
                    partitions['section'] = row['section']
                    if len(protagonist_words) > 0:
                        partitions['protagonist'] = 1
                        save_data_in_s3(content=protagonist_words,
                                        s3_bucket=self.bucket,
                                        s3_key='word.json',
                                        prefix='word',
                                        partitions=partitions)
                        database_has_changed = True
                    if len(non_protagonist_words) > 0:
                        partitions['protagonist'] = 0
                        save_data_in_s3(content=non_protagonist_words,
                                        s3_bucket=self.bucket,
                                        s3_key='word.json',
                                        prefix='word',
                                        partitions=partitions)
                        database_has_changed = True
            finally:
                if database_has_changed:
                    self.repair_table_word()

    def repair_table_word(self):
        print("Going to repair table word...")
        athena_db = AthenaDatabase(database=self.config['aws']['athena'],
                                   s3_output=self.bucket)
        athena_db.query_athena_and_wait(query_string="MSCK REPAIR TABLE word")
        print("Done.")
Example #15
0
    def export_tweepy(self, yesterday):
        json_file = Path(Path(__file__).parent, 'tmp', 'tweepy_video_id.json')
        self.create_json_tweepy_file(source="tweet_from_video_id",
                                     destination=json_file)
        json_video_id_file_compressed = compress(json_file)
        json_file = Path(
            Path(__file__).parent, 'tmp', 'tweepy_user_screen.json')
        self.create_json_tweepy_file(source="tweet_from_screen_name",
                                     destination=json_file)
        json_screen_name_file_compressed = compress(json_file)

        s3 = boto3.resource('s3')
        s3_filename = "tweepy_video_id/reference_date={}/tweepy_from_video_id.json.bz2".format(
            yesterday)
        s3.Bucket(self.s3_data).upload_file(str(json_video_id_file_compressed),
                                            s3_filename)

        s3_filename = "tweepy_screen_name/reference_date={}/tweepy_from_screen_name.json.bz2".format(
            yesterday)
        s3.Bucket(self.s3_data).upload_file(
            str(json_screen_name_file_compressed), s3_filename)

        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)
        athena_db.query_athena_and_wait(
            query_string="DROP TABLE tweepy_video_id")
        athena_db.query_athena_and_wait(
            query_string=ATHENA_CREATE_TWEEPY_VIDEO_ID.format(
                structure=STRUCTURE_TWEEPY_ATHENA, s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE tweepy_video_id")

        athena_db.query_athena_and_wait(
            query_string="DROP TABLE tweepy_screen_name")
        athena_db.query_athena_and_wait(
            query_string=ATHENA_CREATE_TWEEPY_SCREEN_NAME.format(
                structure=STRUCTURE_TWEEPY_ATHENA, s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE tweepy_screen_name")
Example #16
0
# This script was used when I decided to create a table called tweet_user_url to unnest the URLs in each Tweet.
# To make backward compatibility possible, I had to create the data files for previous dates (< 2019-11-28).

from internet_scholar import AthenaDatabase, compress
import boto3
from datetime import date, timedelta
import uuid

current_date = date(2019, 8, 15)
athena_db = AthenaDatabase(database='internet_scholar',
                           s3_output='internet-scholar-admin')
query = """
select
  twitter_stream.id_str as tweet_id,
  twitter_stream.user.id_str as user_id,
  url.expanded_url as url
from
  internet_scholar.twitter_stream_raw as twitter_stream,
  unnest(entities.urls) as t(url)
where
  creation_date = '{creation_date}' and
  url.display_url not like 'twitter.com/%'
order by
  tweet_id,
  user_id,
  url;
"""

while current_date <= date(2019, 11, 27):
    print(str(current_date))
    tweet_user_url = athena_db.query_athena_and_download(
Example #17
0
def create_louvain(min_users, timespan, final_date, end):
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    while final_date <= end:
        print('Louvain - {}'.format(str(final_date)))
        edges = athena_db.query_athena_and_download(
            query_string=EDGES_LOUVAIN.format(final_date=str(final_date),
                                              min_users=min_users,
                                              timespan=timespan),
            filename='edges_louvain.csv')
        g = nx.Graph()
        with open(edges, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for edge in reader:
                g.add_edge(edge['source_id'],
                           edge['target_id'],
                           weight=int(edge['weight']))

        with open('./louvain.csv', 'w', encoding="utf8") as csv_writer:
            writer = csv.DictWriter(csv_writer,
                                    fieldnames=[
                                        'resolution', 'channel_id', 'cluster',
                                        'graph_size', 'cluster_size',
                                        'cluster_count'
                                    ],
                                    dialect='unix')
            writer.writeheader()
            nodes = list(g)
            graph_size = len(nodes)
            for resolution in numpy.arange(10, 0, -0.1):
                partition = community.best_partition(g,
                                                     resolution=resolution,
                                                     randomize=False)
                cluster_count = len(set(partition.values()))
                for partition_number in set(partition.values()):
                    new_partition = list()
                    for channel_id in partition.keys():
                        if partition[channel_id] == partition_number:
                            new_partition.append(channel_id)
                    cluster_size = len(new_partition)
                    new_partition_number = nodes.index(min(new_partition))
                    for item in new_partition:
                        new_record = dict()
                        new_record['resolution'] = "{:.1f}".format(resolution)
                        new_record['channel_id'] = item
                        new_record['cluster'] = new_partition_number
                        new_record['graph_size'] = graph_size
                        new_record['cluster_size'] = cluster_size
                        new_record['cluster_count'] = cluster_count
                        writer.writerow(new_record)

        compressed_file = compress(filename='./louvain.csv',
                                   delete_original=True)
        s3_filename = "youtube_graph_louvain/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format(
            min_users=min_users,
            timespan=timespan,
            final_date=str(final_date))
        s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                                  s3_filename)
        final_date = final_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_louvain')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_LOUVAIN.format(
            s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_louvain')
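
For readers unfamiliar with the python-louvain API used above: community.best_partition returns a dict that maps each node to a community id, and the resolution argument is what the loop sweeps from 10 down to 0.1 to obtain partitions at different granularities. A small self-contained illustration with made-up channel ids:

import networkx as nx
import community  # python-louvain

g = nx.Graph()
g.add_edge('channel_a', 'channel_b', weight=5)
g.add_edge('channel_b', 'channel_c', weight=4)
g.add_edge('channel_x', 'channel_y', weight=6)
g.add_edge('channel_c', 'channel_x', weight=1)  # weak bridge between two groups

for resolution in (2.0, 1.0, 0.5):
    partition = community.best_partition(g, resolution=resolution, randomize=False)
    cluster_count = len(set(partition.values()))
    print(f"resolution={resolution}: {cluster_count} cluster(s) -> {partition}")
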
Example #18
0
    def collect_video_snippets(self):
        logging.info("Start collecting video snippets")
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)
        query = SELECT_YOUTUBE_VIDEOS
        query_count = SELECT_COUNT_YOUTUBE_VIDEOS
        if athena.table_exists("youtube_video_snippet"):
            logging.info("Table youtube_video_snippet exists")
            query = query + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
            query_count = query_count + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
        logging.info(
            "Download IDs for all Youtube videos that have not been processed yet"
        )
        video_count = int(
            athena.query_athena_and_get_result(
                query_string=query_count)['video_count'])
        logging.info("There are %d links to be processed: download them",
                     video_count)
        video_ids_csv = athena.query_athena_and_download(
            query_string=query, filename="video_ids.csv")

        output_json = Path(
            Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
        Path(output_json).parent.mkdir(parents=True, exist_ok=True)
        current_key = 0
        youtube = googleapiclient.discovery.build(
            serviceName="youtube",
            version="v3",
            developerKey=self.credentials[current_key]['developer_key'],
            cache_discovery=False)
        with open(video_ids_csv, newline='') as csv_reader:
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_videos = 0
                for video_id in reader:
                    if num_videos % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d videos processed",
                                     num_videos, video_count)
                    num_videos = num_videos + 1

                    service_unavailable = 0
                    no_response = True
                    while no_response:
                        try:
                            response = youtube.videos().list(
                                part="snippet",
                                id=video_id['video_id']).execute()
                            no_response = False
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Developer key {} rejected (HTTP 403): {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube",
                                        version="v3",
                                        developerKey=self.credentials[
                                            current_key]['developer_key'],
                                        cache_discovery=False)
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    if len(response.get('items', [])) == 0:
                        response['id'] = video_id['video_id']
                        response['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        response[
                            'description'] = "Video unavailable. It has probably been removed by the user."
                        json_writer.write("{}\n".format(json.dumps(response)))
                    else:
                        for item in response['items']:
                            item['snippet']['publishedAt'] = item['snippet'][
                                'publishedAt'].rstrip('Z').replace('T', ' ')
                            item['retrieved_at'] = datetime.utcnow().strftime(
                                "%Y-%m-%d %H:%M:%S.%f")[:-3]
                            json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_video_snippet/{}-{}.json.bz2".format(
            datetime.utcnow().strftime("%Y-%m-%d"), num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Concluded collecting video snippets")
        athena.query_athena_and_wait(
            query_string=CREATE_VIDEO_SNIPPET_JSON.format(
                s3_bucket=self.s3_data))
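
The error handling above, which rotates to the next developer key on HTTP 403 and backs off briefly on HTTP 503, reads more easily as a standalone helper. A sketch under the same assumptions as the method above; the function name and parameters are illustrative, not from the source:

import time
import googleapiclient.discovery
from googleapiclient.errors import HttpError

def list_snippet_with_retries(credentials, video_id, wait_seconds=60, max_unavailable=10):
    # credentials is assumed to be a list of dicts with a 'developer_key' entry,
    # as in the class above.
    current_key = 0
    youtube = googleapiclient.discovery.build(
        serviceName="youtube", version="v3",
        developerKey=credentials[current_key]['developer_key'],
        cache_discovery=False)
    service_unavailable = 0
    while True:
        try:
            return youtube.videos().list(part="snippet", id=video_id).execute()
        except HttpError as e:
            if "403" in str(e):
                current_key += 1
                if current_key >= len(credentials):
                    raise  # every key has been rejected
                youtube = googleapiclient.discovery.build(
                    serviceName="youtube", version="v3",
                    developerKey=credentials[current_key]['developer_key'],
                    cache_discovery=False)
            elif "503" in str(e):
                service_unavailable += 1
                if service_unavailable > max_unavailable:
                    raise
                time.sleep(wait_seconds)
            else:
                raise
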
Example #19
0
def create_classification_tables(min_users, timespan, related_date,
                                 end_related_date, graph_date_difference):
    graph_date = related_date + timedelta(days=graph_date_difference)
    database = sqlite3.connect('./classification.sqlite')

    sqlite_aws = SqliteAWS(database=database,
                           s3_admin='internet-scholar-admin',
                           s3_data='internet-scholar',
                           athena_db='internet_scholar')

    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='recommendation',
        query=RECOMMENDATION.format(begin_date=str(related_date),
                                    end_date=str(end_related_date)))
    database.execute(CREATE_VIEW_RELATED_VIDEO)
    database.row_factory = sqlite3.Row

    while related_date <= end_related_date:
        print('Classification - related: {related_date} graph: {graph_date}'.
              format(related_date=str(related_date),
                     graph_date=str(graph_date)))
        sqlite_aws.convert_s3_csv_to_sqlite(
            s3_path='youtube_graph_louvain/'
            'min_users={min_users}/'
            'timespan={timespan}/'
            'final_date={graph_date}/louvain.csv.bz2'.format(
                min_users=min_users,
                timespan=timespan,
                graph_date=str(graph_date)))
        database.execute(CREATE_TABLE_YOUTUBE_GRAPH_CLASSIFICATION)
        cursor_related = database.cursor()
        cursor_related.execute(
            SELECT_RELATED_VIDEO.format(related_date=str(related_date)))
        for related_video in cursor_related:
            cursor_related_in_graph = database.cursor()
            cursor_related_in_graph.execute(
                SELECT_RELATED_IN_GRAPH.format(
                    channel_id=related_video['channel_id']))
            exists_in_graph = False
            for related_in_graph in cursor_related_in_graph:
                exists_in_graph = True
                # add IDENTITY record
                database.execute(INSERT_CLASSIFICATION,
                                 (related_in_graph['channel_id'],
                                  related_in_graph['channel_id'], '0.0', '0',
                                  related_in_graph['graph_size'], '1',
                                  related_in_graph['graph_size'], 'IDENTITY'))
                cursor_channel_in_the_same_cluster = database.cursor()
                cursor_channel_in_the_same_cluster.execute(
                    SELECT_CHANNELS_IN_THE_SAME_CLUSTER.format(
                        channel_id=related_video['channel_id']))
                # add KINSHIP record
                for channel_in_the_same_cluster in cursor_channel_in_the_same_cluster:
                    database.execute(
                        INSERT_CLASSIFICATION,
                        (related_in_graph['channel_id'],
                         channel_in_the_same_cluster['channel_id'],
                         channel_in_the_same_cluster['resolution'],
                         channel_in_the_same_cluster['cluster'],
                         channel_in_the_same_cluster['graph_size'],
                         channel_in_the_same_cluster['cluster_size'],
                         channel_in_the_same_cluster['cluster_count'],
                         'KINSHIP'))
                cursor_all_nodes_in_graph = database.cursor()
                cursor_all_nodes_in_graph.execute(SELECT_ALL_NODES_IN_GRAPH)
                # add OPPOSITION record
                for node_in_graph in cursor_all_nodes_in_graph:
                    database.execute(
                        INSERT_CLASSIFICATION,
                        (related_in_graph['channel_id'],
                         node_in_graph['channel_id'], '10.0', '-1',
                         related_in_graph['graph_size'],
                         related_in_graph['graph_size'], '1', 'OPPOSITION'))
            if not exists_in_graph:
                cursor_all_nodes_in_graph = database.cursor()
                cursor_all_nodes_in_graph.execute(SELECT_ALL_NODES_IN_GRAPH)
                # add UNKNOWN records
                for node_in_graph in cursor_all_nodes_in_graph:
                    database.execute(INSERT_CLASSIFICATION,
                                     (related_video['channel_id'],
                                      node_in_graph['channel_id'], '-1', '-1',
                                      '-1', '-1', '-1', 'UNKNOWN'))

        database.execute(
            CREATE_TABLE_YOUTUBE_GRAPH_RELATED.format(
                related_date=str(related_date)))
        database.execute(
            INSERT_RELATED_KNOWN.format(related_date=str(related_date)))
        database.execute(
            INSERT_UNKNOWN_IDENTITY.format(related_date=str(related_date)))
        database.execute(
            INSERT_UNKNOWN_RECOMMENDED.format(related_date=str(related_date)))
        database.execute(
            INSERT_UNKNOWN_TRENDING.format(related_date=str(related_date)))
        database.execute(
            INSERT_UNKNOWN_BOTH.format(related_date=str(related_date)))
        database.commit()

        sqlite_aws.convert_sqlite_to_s3_csv(
            s3_path='youtube_graph_classification/'
            'related_date={related_date}/'
            'graph_date_difference={graph_date_difference}/'
            'min_users={min_users}/'
            'timespan={timespan}/'
            'classification.csv.bz2'.format(
                related_date=str(related_date),
                graph_date_difference=graph_date_difference,
                min_users=min_users,
                timespan=timespan),
            order_by='related_channel_id, graph_channel_id')
        sqlite_aws.convert_sqlite_to_s3_csv(
            s3_path='youtube_graph_related/'
            'related_date={related_date}/'
            'graph_date_difference={graph_date_difference}/'
            'min_users={min_users}/'
            'timespan={timespan}/'
            'related.csv.bz2'.format(
                related_date=str(related_date),
                graph_date_difference=graph_date_difference,
                min_users=min_users,
                timespan=timespan),
            order_by='trending_channel_id, recommended_channel_id')

        database.execute('DROP TABLE youtube_graph_related')
        database.execute('DROP TABLE youtube_graph_classification')
        database.execute('DROP TABLE youtube_graph_louvain')
        database.commit()

        graph_date = graph_date + timedelta(days=1)
        related_date = related_date + timedelta(days=1)

    athena = AthenaDatabase(database='internet_scholar',
                            s3_output='internet-scholar-admin')
    athena.query_athena_and_wait(
        'DROP TABLE IF EXISTS youtube_graph_classification')
    athena.query_athena_and_wait(
        CREATE_ATHENA_TABLE_YOUTUBE_GRAPH_CLASSIFICATION.format(
            s3_data='internet-scholar'))
    athena.query_athena_and_wait(
        'MSCK REPAIR TABLE youtube_graph_classification')
    athena.query_athena_and_wait('DROP TABLE IF EXISTS youtube_graph_related')
    athena.query_athena_and_wait(
        CREATE_ATHENA_TABLE_YOUTUBE_GRAPH_RELATED.format(
            s3_data='internet-scholar'))
    athena.query_athena_and_wait('MSCK REPAIR TABLE youtube_graph_related')

    database.close()
    os.remove('classification.sqlite')
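
One detail worth noting in the classification script: the cursors are indexed by column name (for example related_in_graph['graph_size']), which only works because row_factory is set to sqlite3.Row on the connection. A tiny self-contained illustration; the table and column names here are made up:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.row_factory = sqlite3.Row  # fetched rows can now be indexed by column name
conn.execute('create table channel (channel_id text, cluster integer)')
conn.execute("insert into channel values ('UC123', 4)")
for row in conn.execute('select channel_id, cluster from channel'):
    print(row['channel_id'], row['cluster'])  # access by name, not position
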
Example #20
0
    def expand_urls(self, creation_date=None):
        logging.info("begin: expand URLs")
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)

        yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
        if creation_date is None:
            creation_date = yesterday
        logging.info("Expand URLs that were tweeted on {creation_date}".format(
            creation_date=creation_date))

        query_tweet_user_url = self.__TWEET_USER_URL.format(
            creation_date=creation_date)
        query = self.__UNVALIDATED_URLS.format(creation_date=creation_date)
        query_count = self.__COUNT_UNVALIDATED_URLS.format(
            creation_date=creation_date)
        if athena.table_exists("validated_url"):
            logging.info("Table validated_url exists")
            query = query + " and url not in (select validated_url.url from validated_url)"
            query_count = query_count + " and url not in (select validated_url.url from validated_url)"

        logging.info('Update table tweet_user_url')
        tweet_user_url = athena.query_athena_and_download(
            query_string=query_tweet_user_url,
            filename=creation_date + '.csv')
        compressed_file = compress(filename=tweet_user_url)
        s3 = boto3.resource('s3')
        s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format(
            creation_date=creation_date, code=uuid.uuid4().hex)
        logging.info('Upload data file that will comprise tweet_user_url')
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info('Update table tweet_user_url on Athena')
        logging.info(
            "Create Athena table tweet_user_url if does not exist already")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_TWEET_USER_URL.format(
                s3_data=self.s3_data))
        athena.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE tweet_user_url")

        link_count = int(
            athena.query_athena_and_get_result(
                query_string=query_count)['link_count'])
        logging.info("There are %d links to be processed: download them",
                     link_count)
        unvalidated_urls = athena.query_athena_and_download(
            query_string=query, filename="unvalidated_urls.csv")

        with open(unvalidated_urls, newline='') as csv_reader:
            validated_urls = Path(
                Path(__file__).parent, 'tmp', 'validated_urls.csv')
            Path(validated_urls).parent.mkdir(parents=True, exist_ok=True)
            logging.info("Create file %s for validated URLs", validated_urls)
            with open(str(validated_urls), 'w') as csv_writer:
                reader = csv.DictReader(csv_reader)
                writer = csv.DictWriter(csv_writer,
                                        fieldnames=[
                                            'url', 'validated_url',
                                            'status_code', 'content_type',
                                            'content_length', 'created_at'
                                        ],
                                        dialect='unix')
                url_expander = URLExpander()
                num_links = 0
                for url in reader:
                    if num_links % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d links processed", num_links,
                                     link_count)
                    num_links = num_links + 1
                    for expanded_url in url_expander.expand_url(url['url']):
                        writer.writerow(expanded_url)
                logging.info("All links processed")

        logging.info("Compress file %s", validated_urls)
        compressed_file = compress(filename=validated_urls,
                                   delete_original=True)

        if creation_date == yesterday:
            filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
                time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime()), link_count)
        else:
            filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
                creation_date + '-23-59-59', link_count)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, filename_s3)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), filename_s3)

        logging.info(
            "Delete previous validated_url data: will be generated again")
        s3.Bucket(
            self.s3_data).objects.filter(Prefix="validated_url/").delete()

        logging.info(
            "Create Athena table validated_url_raw if does not exist already")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL_RAW.format(
                s3_data=self.s3_data))
        logging.info("Drop Athena table validated_url")
        athena.query_athena_and_wait(
            query_string="drop table if exists validated_url")
        logging.info("Creates Athena table validated_url through CTAS")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL.format(
                s3_data=self.s3_data))
        logging.info("END: expand URLs")