def collect_ancillary_tweets(self, filter_name, method='twint'): athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) filter_terms = athena_db.query_athena_and_get_result( query_string=FILTER_TERMS.format(name=filter_name))['track'] new_videos_yesterday = Path( Path(__file__).parent, 'tmp', 'new_videos_yesterday.csv') Path(new_videos_yesterday).parent.mkdir(parents=True, exist_ok=True) new_videos_yesterday_file = athena_db.query_athena_and_download( query_string=NEW_VIDEOS_YESTERDAY, filename=new_videos_yesterday) yesterday = athena_db.query_athena_and_get_result( query_string=YESTERDAY)['yesterday'] if method == 'twint': self.collect_user_tweets_twint( filter_terms=filter_terms, new_videos_yesterday_file=new_videos_yesterday_file) self.export_twint(yesterday=yesterday) else: self.collect_user_tweets_tweepy( filter_terms=filter_terms, new_videos_yesterday_file=new_videos_yesterday_file) self.export_tweepy(yesterday=yesterday)
def create_edges(min_users, timespan, final_date, end): s3 = boto3.resource('s3') athena_db = AthenaDatabase(database='internet_scholar', s3_output='internet-scholar-admin') min_date = athena_db.query_athena_and_get_result( query_string=MIN_DATE)['min_date'] min_date = datetime.strptime(min_date, '%Y-%m-%d').date() initial_date = final_date - timedelta(days=timespan - 1) while final_date <= end: print('Edges - {}'.format(str(final_date))) if initial_date >= min_date: edges = athena_db.query_athena_and_download( query_string=SELECT_EDGES.format( initial_date=str(initial_date), final_date=str(final_date), min_users=min_users), filename='edges.csv') compressed_file = compress(filename=edges, delete_original=True) s3_filename = "youtube_graph_edge/min_users={min_users}/" \ "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format( min_users=min_users, timespan=timespan, final_date=str(final_date)) s3.Bucket('internet-scholar').upload_file(str(compressed_file), s3_filename) final_date = final_date + timedelta(days=1) initial_date = initial_date + timedelta(days=1) athena_db.query_athena_and_wait( query_string='drop table if exists youtube_graph_edge') athena_db.query_athena_and_wait( query_string=CREATE_YOUTUBE_GRAPH_EDGE.format( s3_data='internet-scholar')) athena_db.query_athena_and_wait( query_string='MSCK REPAIR TABLE youtube_graph_edge')
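# Illustrative note (not called by create_edges above): the daily edge files are written as
# Hive-style partitions, which is why the function can simply DROP/CREATE the external table
# and run MSCK REPAIR TABLE to register every final_date partition it uploaded. The helper
# below only reproduces how the S3 key is composed from the same format string used above.
def edge_partition_key(min_users, timespan, final_date):
    """Return the partitioned S3 key under which one day's edge list is stored."""
    return ("youtube_graph_edge/min_users={min_users}/timespan={timespan}/"
            "final_date={final_date}/edges.csv.bz2").format(
                min_users=min_users, timespan=timespan, final_date=final_date)

# edge_partition_key(5, 7, '2019-11-01')
# -> 'youtube_graph_edge/min_users=5/timespan=7/final_date=2019-11-01/edges.csv.bz2'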
def update_table_youtube_twitter_addition(self): athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) new_videos_filename = Path( Path(__file__).parent, 'tmp', 'new_videos_today.csv') Path(new_videos_filename).parent.mkdir(parents=True, exist_ok=True) new_videos = athena_db.query_athena_and_download( query_string=NEW_VIDEOS_TODAY, filename=new_videos_filename) new_videos_compressed = Path( Path(__file__).parent, 'tmp', 'new_videos.csv.gz') with open(str(new_videos), 'rt') as f_in: with gzip.open(str(new_videos_compressed), 'wt') as f_out: reader = csv.DictReader(f_in) for video_id in reader: f_out.write(video_id['id'] + '\n') s3 = boto3.resource('s3') s3_filename = "youtube_twitter_addition/creation_date={}/video_ids.csv.gz".format( datetime.utcnow().strftime("%Y-%m-%d")) s3.Bucket(self.s3_data).upload_file(str(new_videos_compressed), s3_filename) athena_db.query_athena_and_wait( query_string="MSCK REPAIR TABLE youtube_twitter_addition")
def repair_table_metadata(self): print("Going to repair table metadata...") athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) athena_db.query_athena_and_wait( query_string="MSCK REPAIR TABLE metadata") self.repair_metadata = False print("Done.")
def add_timeframe_section_to_s3_path(self): athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) # self.repair_table_metadata() # metadata_records = athena_db.query_athena_and_download(query_string="select distinct service, project, " # "speaker, performance_date, speaker_type " # "from metadata " # "order by service, project, speaker, " # "performance_date, speaker_type", # filename='metadata_records.csv') # with open(metadata_records) as metadata_file: # metadata_reader = csv.DictReader(metadata_file) # for row in metadata_reader: # print(f"{row['service']}/{row['project']}/{row['speaker']}/{row['performance_date']}/{row['speaker_type']}") # move_data_in_s3( # bucket_name=self.bucket, # origin=f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/timeframe=4/timesection=1/speaker_type={row['speaker_type']}/transcript.json.bz2", # destination=f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/speaker_type={row['speaker_type']}/timeframe=4/section=1/transcript.json.bz2" # ) self.repair_table_word() words_records = athena_db.query_athena_and_download( query_string="select distinct project, speaker, " "performance_date, service, protagonist " "from word " "order by project, speaker, performance_date, " "service, protagonist", filename='words_records.csv') with open(words_records) as words_file: words_reader = csv.DictReader(words_file) for row in words_reader: move_data_in_s3( bucket_name=self.bucket, origin= f"word/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/service={row['service']}/protagonist={row['protagonist']}/word.json.bz2", destination= f"word/project={row['project']}/speaker={row['speaker']}/performance_date={row['performance_date']}/part=1/service={row['service']}/protagonist={row['protagonist']}/timeframe=4/section=1/word.json.bz2" )
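# move_data_in_s3 comes from the project's utility module and is not defined in this file.
# A minimal sketch of what such a helper could look like with boto3 (server-side copy of the
# object to its new key, then deletion of the original) is shown below for reference only;
# the real implementation may differ.
import boto3

def move_data_in_s3_sketch(bucket_name, origin, destination):
    """Move an S3 object within one bucket by copying it to `destination` and deleting `origin`."""
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, destination).copy_from(
        CopySource={'Bucket': bucket_name, 'Key': origin})
    s3.Object(bucket_name, origin).delete()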
def collect_related_video(self, region_code, creation_date=None): athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) trending_filename = Path(Path(__file__).parent, 'tmp', 'trending.csv') Path(trending_filename).parent.mkdir(parents=True, exist_ok=True) if creation_date is None: query_string = TRENDING_VIDEOS.format( creation_date=( date.today() - timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"), number_of_videos=self.NUMBER_OF_VIDEOS) else: query_string = TRENDING_VIDEOS.format( creation_date=( datetime.strptime(creation_date, '%Y-%m-%d') - timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"), number_of_videos=self.NUMBER_OF_VIDEOS) trending_videos = athena_db.query_athena_and_download( query_string=query_string, filename=trending_filename) with open(trending_videos, newline='', encoding="utf8") as csv_reader: output_json = Path( Path(__file__).parent, 'tmp', 'youtube_related_video.json') Path(output_json).parent.mkdir(parents=True, exist_ok=True) with open(output_json, 'w') as json_writer: reader = csv.DictReader(csv_reader) current_key = 0 try: youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[current_key] ['developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url( url= "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest" ) youtube = googleapiclient.discovery.build_from_document( service=service, developerKey=self.credentials[current_key] ['developer_key']) num_videos = 0 if creation_date is None: max_results = self.NUMBER_OF_RELATED_VIDEOS part = 'id' else: part = 'snippet' max_results = self.NUMBER_OF_RELATED_VIDEOS * 3 if max_results > 50: max_results = 50 for trending_video in reader: service_unavailable = 0 connection_reset_by_peer = 0 no_response = True response = dict() while no_response: try: response = youtube.search().list( part=part, type='video', regionCode=region_code, relatedToVideoId=trending_video['id'], maxResults=max_results).execute() no_response = False except SocketError as e: if e.errno != errno.ECONNRESET: logging.info("Other socket error!") raise else: connection_reset_by_peer = connection_reset_by_peer + 1 logging.info( "Connection reset by peer! 
{}".format( connection_reset_by_peer)) if connection_reset_by_peer <= 10: time.sleep( self.WAIT_WHEN_CONNECTION_RESET_BY_PEER ) try: youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[ current_key]['developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url( url= "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest" ) youtube = googleapiclient.discovery.build_from_document( service=service, developerKey=self.credentials[ current_key]['developer_key']) else: raise except HttpError as e: if "403" in str(e): logging.info( "Invalid {} developer key: {}".format( current_key, self.credentials[current_key] ['developer_key'])) current_key = current_key + 1 if current_key >= len(self.credentials): raise else: try: youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[ current_key]['developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url( url= "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest" ) youtube = googleapiclient.discovery.build_from_document( service=service, developerKey=self.credentials[ current_key]['developer_key']) elif "Backend Error" in str(e): # Backend errors are usually associated to getting # recommended videos for a video that was deleted by the user. # In that case, just move on. logging.info( "Backend error. Video %s will be ignored", trending_video['id']) no_response = False elif "Not Found" in str(e): # Backend errors are usually associated to getting # recommended videos for a video that was deleted by the user. # In that case, just move on. logging.info( "Not Found error. Video %s will be ignored", trending_video['id']) no_response = False elif "404" in str(e): logging.info( "Requested entity was not found. Video %s will be ignored", trending_video['id']) no_response = False elif "400" in str(e): logging.info( "Invalid argument. 
Video %s will be ignored", trending_video['id']) no_response = False elif "503" in str(e): logging.info("Service unavailable") service_unavailable = service_unavailable + 1 if service_unavailable <= 10: time.sleep( self.WAIT_WHEN_SERVICE_UNAVAILABLE) else: raise else: raise rank = 1 for item in response.get('items', {}): item['relatedToVideoId'] = trending_video['id'] item['retrieved_at'] = datetime.utcnow().strftime( "%Y-%m-%d %H:%M:%S.%f")[:-3] item['rank'] = rank if creation_date is None: rank = rank + 1 num_videos = num_videos + 1 json_writer.write("{}\n".format(json.dumps(item))) else: item['snippet']['publishedAt'] = item['snippet'][ 'publishedAt'].rstrip('Z').replace('T', ' ') if rank <= self.NUMBER_OF_RELATED_VIDEOS: if item['snippet'][ 'publishedAt'] <= creation_date + ' 00:00:00.000': rank = rank + 1 num_videos = num_videos + 1 json_writer.write("{}\n".format( json.dumps(item))) logging.info("Compress file %s", output_json) compressed_file = compress(filename=output_json, delete_original=True) s3 = boto3.resource('s3') if creation_date is None: s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format( creation_date=datetime.utcnow().strftime("%Y-%m-%d"), num_videos=num_videos) else: s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format( creation_date=creation_date, num_videos=num_videos) logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename) s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename) logging.info("Recreate table for Youtube related video snippets") athena_db.query_athena_and_wait( query_string="DROP TABLE IF EXISTS youtube_related_video") athena_db.query_athena_and_wait( query_string=CREATE_VIDEO_RELATED_JSON.format( s3_bucket=self.s3_data)) athena_db.query_athena_and_wait( query_string="MSCK REPAIR TABLE youtube_related_video") logging.info("Concluded collecting related video snippets")
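# The try/except blocks above rebuild the YouTube client in exactly the same way every time
# the developer key is rotated. A helper such as the sketch below (not used by the code above)
# would factor out that pattern; read_dict_from_url is the project helper this module already
# relies on.
import googleapiclient.discovery
from googleapiclient.errors import UnknownApiNameOrVersion


def build_youtube_client(developer_key):
    """Build a YouTube Data API client, falling back to the static discovery document
    when the discovery service cannot resolve the API name/version."""
    try:
        return googleapiclient.discovery.build(
            serviceName="youtube",
            version="v3",
            developerKey=developer_key,
            cache_discovery=False)
    except UnknownApiNameOrVersion:
        service = read_dict_from_url(
            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
        return googleapiclient.discovery.build_from_document(
            service=service, developerKey=developer_key)

# Usage would mirror the existing code, e.g.:
#     youtube = build_youtube_client(self.credentials[current_key]['developer_key'])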
def collect_video_snippets(self): logging.info("Start collecting video snippets") athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) if not athena.table_exists("youtube_video_snippet"): select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO else: logging.info("Table youtube_video_snippet exists") select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO + EXTRA_TWITTER_STREAM_VIDEO select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO + EXTRA_YOUTUBE_RELATED_VIDEO queries = [select_twitter_stream_video] if athena.table_exists("youtube_related_video"): queries.append(select_youtube_related_video) query = " union all ".join(queries) query_count = SELECT_COUNT.format(query) query_group_by = SELECT_GROUP_BY.format(query) logging.info("Download IDs for all Youtube videos that have not been processed yet") video_count = int(athena.query_athena_and_get_result(query_string=query_count)['video_count']) logging.info("There are %d links to be processed: download them", video_count) video_ids_csv = athena.query_athena_and_download(query_string=query_group_by, filename="video_ids.csv") output_json = Path(Path(__file__).parent, 'tmp', 'youtube_video_snippet.json') Path(output_json).parent.mkdir(parents=True, exist_ok=True) current_key = 0 try: youtube = googleapiclient.discovery.build(serviceName="youtube", version="v3", developerKey= self.credentials[current_key]['developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url(url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest") youtube = googleapiclient.discovery.build_from_document(service=service, developerKey=self.credentials[current_key][ 'developer_key']) with open(video_ids_csv, newline='') as csv_reader: with open(output_json, 'w') as json_writer: reader = csv.DictReader(csv_reader) num_videos = 0 for video_id in reader: if num_videos % self.LOGGING_INTERVAL == 0: logging.info("%d out of %d videos processed", num_videos, video_count) num_videos = num_videos + 1 connection_reset_by_peer = 0 service_unavailable = 0 no_response = True response = dict() while no_response: try: response = youtube.videos().list(part="snippet",id=video_id['video_id']).execute() no_response = False except SocketError as e: if e.errno != errno.ECONNRESET: logging.info("Other socket error!") raise else: connection_reset_by_peer = connection_reset_by_peer + 1 logging.info("Connection reset by peer! 
{}".format(connection_reset_by_peer)) if connection_reset_by_peer <= 10: time.sleep(self.WAIT_WHEN_CONNECTION_RESET_BY_PEER) try: youtube = googleapiclient.discovery.build(serviceName="youtube", version="v3", developerKey= self.credentials[current_key][ 'developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url( url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest") youtube = googleapiclient.discovery.build_from_document(service=service, developerKey= self.credentials[ current_key][ 'developer_key']) else: raise except HttpError as e: if "403" in str(e): logging.info("Invalid {} developer key: {}".format( current_key, self.credentials[current_key]['developer_key'])) current_key = current_key + 1 if current_key >= len(self.credentials): raise else: try: youtube = googleapiclient.discovery.build(serviceName="youtube", version="v3", developerKey= self.credentials[current_key][ 'developer_key'], cache_discovery=False) except UnknownApiNameOrVersion as e: service = read_dict_from_url( url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest") youtube = googleapiclient.discovery.build_from_document(service=service, developerKey= self.credentials[ current_key][ 'developer_key']) elif "503" in str(e): logging.info("Service unavailable") service_unavailable = service_unavailable + 1 if service_unavailable <= 10: time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE) else: raise else: raise if len(response.get('items', [])) == 0: response['id'] = video_id['video_id'] response['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] response['description'] = "Video unavailable. It has probably been removed by the user." json_writer.write("{}\n".format(json.dumps(response))) else: for item in response['items']: item['snippet']['publishedAt'] = item['snippet']['publishedAt'].rstrip('Z').replace('T', ' ') item['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] json_writer.write("{}\n".format(json.dumps(item))) logging.info("Compress file %s", output_json) compressed_file = compress(filename=output_json, delete_original=True) s3 = boto3.resource('s3') s3_filename = "youtube_video_snippet/creation_date={}/{}-{}.json.bz2".format(datetime.utcnow().strftime("%Y-%m-%d"), uuid.uuid4().hex, num_videos) logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename) s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename) logging.info("Recreate table for Youtube channel stats") athena.query_athena_and_wait(query_string="DROP TABLE IF EXISTS youtube_video_snippet") athena.query_athena_and_wait(query_string=CREATE_VIDEO_SNIPPET_JSON.format(s3_bucket=self.s3_data)) athena.query_athena_and_wait(query_string="MSCK REPAIR TABLE youtube_video_snippet") logging.info("Concluded collecting video snippets")
def collect_channel_stats(self): logging.info("Start collecting Youtube channel stats") channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv') athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL, filename=channel_ids) channel_count = int( athena.query_athena_and_get_result( query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count']) logging.info("There are %d channels to be processed: download them", channel_count) current_key = 0 youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[current_key]['developer_key'], cache_discovery=False) with open(channel_ids, newline='') as csv_reader: output_json = Path( Path(__file__).parent, 'tmp', 'youtube_channel_stats.json') with open(output_json, 'w') as json_writer: reader = csv.DictReader(csv_reader) num_channels = 0 for channel_id in reader: if num_channels % self.LOGGING_INTERVAL == 0: logging.info("%d out of %d channels processed", num_channels, channel_count) num_channels = num_channels + 1 service_unavailable = 0 no_response = True while no_response: try: response = youtube.channels().list( part="statistics", id=channel_id['channel_id']).execute() no_response = False except HttpError as e: if "403" in str(e): logging.info( "Invalid {} developer key: {}".format( current_key, self.credentials[current_key] ['developer_key'])) current_key = current_key + 1 if current_key >= len(self.credentials): raise else: youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[ current_key]['developer_key'], cache_discovery=False) elif "503" in str(e): logging.info("Service unavailable") service_unavailable = service_unavailable + 1 if service_unavailable <= 10: time.sleep( self.WAIT_WHEN_SERVICE_UNAVAILABLE) else: raise else: raise for item in response.get('items', []): item['retrieved_at'] = datetime.utcnow().strftime( "%Y-%m-%d %H:%M:%S.%f")[:-3] json_writer.write("{}\n".format(json.dumps(item))) logging.info("Compress file %s", output_json) compressed_file = compress(filename=output_json, delete_original=True) s3 = boto3.resource('s3') s3_filename = "youtube_channel_stats/creation_date={}/{}.json.bz2".format( datetime.utcnow().strftime("%Y-%m-%d"), num_channels) logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename) s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename) logging.info("Recreate table for Youtube channel stats") athena.query_athena_and_wait( query_string="DROP TABLE IF EXISTS youtube_channel_stats") athena.query_athena_and_wait( query_string=CREATE_CHANNEL_STATS_JSON.format( s3_bucket=self.s3_data)) athena.query_athena_and_wait( query_string="MSCK REPAIR TABLE youtube_channel_stats") logging.info("Concluded collecting channel stats")
def inner_retrieve_transcript(self, project, speaker, performance_date, speaker_type, part, timeframe, language, filepath, microsoft, ibm, aws, google): athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) # delete existing sections if microsoft: self.delete_different_timeframe(service='microsoft', project=project, speaker=speaker, performance_date=performance_date, speaker_type=speaker_type, part=part, timeframe=timeframe) if google: self.delete_different_timeframe(service='google', project=project, speaker=speaker, performance_date=performance_date, speaker_type=speaker_type, part=part, timeframe=timeframe) if ibm: self.delete_different_timeframe(service='ibm', project=project, speaker=speaker, performance_date=performance_date, speaker_type=speaker_type, part=part, timeframe=timeframe) if aws: self.delete_different_timeframe(service='aws', project=project, speaker=speaker, performance_date=performance_date, speaker_type=speaker_type, part=part, timeframe=timeframe) if self.repair_metadata: self.repair_table_metadata() # create audio object and slice it according to timeframe (in hours) extension = Path(filepath).suffix[1:] sound = AudioSegment.from_file(filepath, extension) sound = sound.set_channels(1) if (timeframe * 60 * 60) > 13200.0: # more than 3 hours and 40 minutes self.instance_type = 't3a.micro' chunk_length_ms = timeframe * 60 * 60 * 1000 # pydub calculates in millisec chunks = make_chunks(sound, chunk_length_ms) # Make chunks of ten seconds # determine list of jobs that need to be performed jobs = list() for i in range(1, len(chunks) + 1): if microsoft: jobs.append({ 'project': project, 'speaker': speaker, 'performance_date': performance_date, 'speaker_type': speaker_type, 'part': part, 'timeframe': timeframe, 'section': i, 'service': 'microsoft' }) if ibm: jobs.append({ 'project': project, 'speaker': speaker, 'performance_date': performance_date, 'speaker_type': speaker_type, 'part': part, 'timeframe': timeframe, 'section': i, 'service': 'ibm' }) if aws: jobs.append({ 'project': project, 'speaker': speaker, 'performance_date': performance_date, 'speaker_type': speaker_type, 'part': part, 'timeframe': timeframe, 'section': i, 'service': 'aws' }) if google: jobs.append({ 'project': project, 'speaker': speaker, 'performance_date': performance_date, 'speaker_type': speaker_type, 'part': part, 'timeframe': timeframe, 'section': i, 'service': 'google' }) jobs_values = "" for i in range(len(jobs)): jobs_values = f"('{jobs[i]['service']}','{jobs[i]['project']}','{jobs[i]['speaker']}'," \ f"'{jobs[i]['performance_date']}',{jobs[i]['part']},'{jobs[i]['speaker_type']}'," \ f"{jobs[i]['timeframe']},{jobs[i]['section']}),{jobs_values}" jobs_values = jobs_values[:-1] # eliminate the final comma jobs_athena = athena_db.query_athena_and_download( query_string=SELECT_JOBS.format(jobs_values=jobs_values), filename='jobs.csv') with open(jobs_athena) as jobs_file: jobs_reader = csv.DictReader(jobs_file) jobs = list() for row in jobs_reader: jobs.append({ 'service': row['service'], 'project': row['project'], 'speaker': row['speaker'], 'performance_date': row['performance_date'], 'part': row['part'], 'speaker_type': row['speaker_type'], 'timeframe': row['timeframe'], 'section': row['section'] }) # export audio e instantiate cloud transcribers if len(jobs) > 0: created_audio = False Path('./audio/').mkdir(parents=True, exist_ok=True) destination = list() try: # export audio for i, chunk in enumerate(chunks): temp_sound_file = f"./audio/{uuid.uuid4()}.wav" 
chunk.export(temp_sound_file, format="wav", parameters=['-acodec', 'pcm_s16le']) created_audio = True destination.append(temp_sound_file) # instantiate cloud transcribers for job in jobs: self.instantiate_cloud_transcriber( service=job['service'], project=job['project'], performance_date=job['performance_date'], part=job['part'], timeframe=job['timeframe'], section=job['section'], language=language, speaker=job['speaker'], speaker_type=job['speaker_type'], filepath=destination[int(job['section']) - 1]) # destination is zero-based finally: if created_audio: shutil.rmtree("./audio")
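# pydub measures duration in milliseconds, which is why the chunk length above is
# timeframe (hours) * 60 * 60 * 1000. Minimal self-contained illustration of the same
# slicing, assuming pydub (and ffmpeg) are available; the file path and function name are
# placeholders, not part of the pipeline:
from pydub import AudioSegment
from pydub.utils import make_chunks


def split_into_timeframes(audio_path, timeframe_hours):
    """Return mono chunks of at most `timeframe_hours` hours each."""
    sound = AudioSegment.from_file(audio_path).set_channels(1)
    chunk_length_ms = timeframe_hours * 60 * 60 * 1000
    return make_chunks(sound, chunk_length_ms)

# split_into_timeframes('performance.wav', 1) on a 90-minute recording yields two chunks
# (60 minutes and 30 minutes).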
def collect_channel_stats(self): logging.info("Start collecting Youtube channel stats - producer") sqs = boto3.resource('sqs') credentials_queue = sqs.get_queue_by_name( QueueName='youtube_credentials') credentials_queue.purge() logging.info('Cleaned queue for credentials') for credential in self.credentials: credentials_queue.send_message( MessageBody=credential['developer_key']) logging.info('Enqueued credentials') athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) logging.info("Recreate table for Youtube channel stats") athena.query_athena_and_wait( query_string="DROP TABLE IF EXISTS youtube_channel_stats") athena.query_athena_and_wait( query_string=CREATE_CHANNEL_STATS_JSON.format( s3_bucket=self.s3_data)) athena.query_athena_and_wait( query_string="MSCK REPAIR TABLE youtube_channel_stats") channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv') athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL, filename=channel_ids) channel_count = int( athena.query_athena_and_get_result( query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count']) logging.info("There are %d channels to be processed: download them", channel_count) with open(channel_ids, newline='') as csv_reader: reader = csv.DictReader(csv_reader) channels_queue = sqs.get_queue_by_name( QueueName='youtube_channels') channels_queue.purge() logging.info('Cleaned queue for channels') channels_message = list() for channel_id in reader: channels_message.append(channel_id['channel_id']) if len(channels_message) == self.QUEUING_INTERVAL: channels_queue.send_message( MessageBody=json.dumps(channels_message)) channels_message = list() logging.info("Added %d channels to the queue", self.QUEUING_INTERVAL) if len(channels_message) != 0: channels_queue.send_message( MessageBody=json.dumps(channels_message)) logging.info("Added %d channels to the queue", len(channels_message)) logging.info("Concluded collecting channel stats - producer")
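# This method is the producer half of a queue-based fan-out: developer keys go on the
# 'youtube_credentials' queue and channel-id batches on the 'youtube_channels' queue. The
# consumer lives elsewhere; the sketch below is only an assumption about how a worker might
# drain the channel queue, not the project's actual implementation.
import json
import boto3


def consume_channel_batches(queue_name='youtube_channels'):
    sqs = boto3.resource('sqs')
    queue = sqs.get_queue_by_name(QueueName=queue_name)
    while True:
        messages = queue.receive_messages(MaxNumberOfMessages=1, WaitTimeSeconds=20)
        if not messages:
            break  # nothing returned after a long poll: assume the queue is drained
        for message in messages:
            channel_ids = json.loads(message.body)
            # ...request statistics for each channel_id with the YouTube Data API...
            message.delete()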
def import_data(related_date, end_related_date, graph_date_difference, timespan): database = sqlite3.connect('./youtube_recommendations.sqlite') sqlite_aws = SqliteAWS(database=database, s3_admin='internet-scholar-admin', s3_data='internet-scholar', athena_db='internet_scholar') logging.info('Retrieve recommendations...') sqlite_aws.convert_athena_query_to_sqlite( table_name='recommendation_aux', query=RECOMMENDATION.format(begin_date=str(related_date), end_date=str(end_related_date))) logging.info('Add primary key to recommendation table...') database.execute(CREATE_TABLE_RECOMMENDATION) database.execute(INSERT_TABLE_RECOMMENDATION) database.execute('DROP TABLE recommendation_aux') logging.info('Update categories and null values...') database.execute(UPDATE_CATEGORY_SEED) database.execute(UPDATE_CATEGORY_RECOMMENDED) database.execute(UPDATE_NULL_SEED) database.execute(UPDATE_NULL_RECOMMENDED) logging.info('Retrieve Twitter users and YouTube channel data...') initial_date = related_date + timedelta( days=graph_date_difference) - timedelta(days=timespan - 1) final_date = end_related_date + timedelta(days=graph_date_difference) sqlite_aws.convert_athena_query_to_sqlite( table_name='twitter_user_channel', query=TWITTER_USER_CHANNEL.format(initial_date=str(initial_date), final_date=str(final_date))) logging.info('Calculate number of common Twitter users per channel...') database.execute(CREATE_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS) current_date = related_date while current_date <= end_related_date: logging.info(str(current_date)) initial_date = current_date + timedelta( days=graph_date_difference) - timedelta(days=timespan - 1) final_date = current_date + timedelta(days=graph_date_difference) database.execute( INSERT_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS.format( initial_date=initial_date, final_date=final_date)) current_date = current_date + timedelta(days=1) logging.info('Update aggregate on SQLite table 1...') database.execute( "ALTER TABLE recommendation ADD COLUMN seed_user_count INT") database.execute(UPDATE_SEED_USER_COUNT) logging.info('Update aggregate on SQLite table 2...') database.execute( "ALTER TABLE recommendation ADD COLUMN recommended_user_count INT") database.execute(UPDATE_RECOMMENDED_USER_COUNT) logging.info('Update aggregate on SQLite table 3...') database.execute( "ALTER TABLE recommendation ADD COLUMN common_user_count INT") database.execute(UPDATE_COMMON_USER_COUNT) logging.info('Retrieve info about political leaning...') sqlite_aws.convert_athena_query_to_sqlite( table_name='channel_political_leaning', query=SELECT_POLITICAL_LEANING.format( initial_date=str(related_date), final_date=str(end_related_date))) logging.info('Update political leaning info on SQLite 1...') database.execute( "ALTER TABLE recommendation ADD COLUMN seed_political_leaning TEXT") database.execute(UPDATE_SEED_POLITICAL_LEANING) logging.info('Update political leaning info on SQLite 1...') database.execute( "ALTER TABLE recommendation ADD COLUMN recommended_political_leaning TEXT" ) database.execute(UPDATE_RECOMMENDED_POLITICAL_LEANING) logging.info('Retrieve data on channel stats...') athena_db = AthenaDatabase(database='internet_scholar', s3_output='internet-scholar-admin') athena_db.query_athena_and_wait( query_string=CREATE_VIEW_ENHANCED_CHANNEL_STATS) sqlite_aws.convert_athena_query_to_sqlite( table_name='channel_stats', query=SELECT_ENHANCED_STATS.format(initial_date=str(related_date), final_date=str(end_related_date))) logging.info('Add primary key to channel stats...') 
database.execute(CREATE_CHANNEL_STATS_WITH_PRIMARY_KEY) database.execute(INSERT_CHANNEL_STATS_WITH_PRIMARY_KEY) add_stat_to_sqlite(database, field='view_count') add_stat_to_sqlite(database, field='cumulative_view_count') add_stat_to_sqlite(database, field='subscriber_count') add_stat_to_sqlite(database, field='cumulative_subscriber_count') add_stat_to_sqlite(database, field='video_count') add_stat_to_sqlite(database, field='cumulative_video_count') add_stat_to_sqlite(database, field='comment_count') add_stat_to_sqlite(database, field='cumulative_comment_count') database.execute('DROP TABLE channel_political_leaning') database.execute('DROP TABLE channel_stats') database.execute('DROP TABLE channel_stats_with_primary_key') database.execute('DROP TABLE twitter_user_channel') database.execute('DROP TABLE youtube_channel_common_twitter_users') database.commit() database.execute('VACUUM') database.close() new_filename = compress('./youtube_recommendations.sqlite') s3_filename = "youtube_data_export_r/{timestamp}.sqlite.bz2".format( timestamp=datetime.utcnow().strftime("%Y%m%d-%H%M%S")) s3 = boto3.resource('s3') s3.Bucket('internet-scholar').upload_file(str(new_filename), s3_filename)
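# compress() is imported from the internet_scholar utility module; throughout this codebase it
# is called as compress(filename, delete_original=...) and returns the path of a .bz2 file
# that is then uploaded to S3. A rough standard-library equivalent, for illustration only:
import bz2
import os
import shutil


def compress_sketch(filename, delete_original=False):
    """Write `<filename>.bz2` next to the original file and return its path."""
    compressed_name = str(filename) + '.bz2'
    with open(filename, 'rb') as source, bz2.open(compressed_name, 'wb') as target:
        shutil.copyfileobj(source, target)
    if delete_original:
        os.remove(filename)
    return compressed_name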
def export_google_sheets(self, project=None, speaker=None, interval_in_seconds=10): self.parse_words(project=project, speaker=speaker) Path("./csv/").mkdir(parents=True, exist_ok=True) print("Export CSVs...") try: json_string = json.dumps(self.config['google']) Path('./local_credentials/').mkdir(parents=True, exist_ok=True) temp_file = f"./local_credentials/{uuid.uuid4()}.json" with open(temp_file, 'w', encoding="utf-8") as json_file: json_file.write(json_string) try: credentials_google_drive = service_account.Credentials.from_service_account_file( temp_file, scopes=['https://www.googleapis.com/auth/drive']) credentials_google_sheets = service_account.Credentials.from_service_account_file( temp_file, scopes=['https://www.googleapis.com/auth/spreadsheets']) finally: shutil.rmtree('./local_credentials') google_drive = build('drive', 'v3', credentials=credentials_google_drive) google_sheets = build('sheets', 'v4', credentials=credentials_google_sheets) athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) all_projects = athena_db.query_athena_and_download( query_string=SELECT_ALL_PROJECTS.format( where_clause=self.get_where_clause(project=project, speaker=speaker)), filename='selected_all_projects.csv') with open(all_projects) as all_projects_csv: projects_reader = csv.DictReader(all_projects_csv) for projects_row in projects_reader: response_project = google_drive.files().list( q=f"mimeType='application/vnd.google-apps.folder' and " f"'{self.config['google']['transcription_folder']}' in parents and " f"name='{projects_row['project']}'", spaces='drive', fields='files(id, name)').execute() if len(response_project['files']) == 0: folder_metadata = { 'name': projects_row['project'], 'mimeType': 'application/vnd.google-apps.folder', 'parents': [ self.config['google']['transcription_folder'], ] } project_folder = google_drive.files().create( body=folder_metadata, fields='id').execute() project_id = project_folder['id'] elif len(response_project['files']) == 1: project_id = response_project['files'][0]['id'] else: raise Exception( "Error! Should not have more than 1 folder for this project!" ) all_speakers = athena_db.query_athena_and_download( query_string=SELECT_ALL_SPEAKERS.format( where_clause=self.get_where_clause( project=projects_row['project'], speaker=speaker)), filename='selected_all_speakers.csv') with open(all_speakers) as all_speakers_csv: speakers_reader = csv.DictReader(all_speakers_csv) for speakers_row in speakers_reader: response_spreadsheet = google_drive.files().list( q= f"mimeType='application/vnd.google-apps.spreadsheet' and '{project_id}' in parents and name='{speakers_row['speaker']}'", spaces='drive', fields='files(id, name)').execute() if len(response_spreadsheet['files']) == 1: print( f"Spreadsheet for {speakers_row['speaker']} already exists. I will not overwrite." ) elif len(response_spreadsheet['files']) >= 2: raise Exception( "Error! Should not have more than 1 spreadsheet for this project!" 
) else: # it is 0 body = { 'mimeType': 'application/vnd.google-apps.spreadsheet', 'name': speakers_row['speaker'], 'parents': [ project_id, ] } response = google_drive.files().create( body=body, fields='id').execute() speaker_id = response['id'] all_parts = athena_db.query_athena_and_download( query_string=SELECT_ALL_PARTS.format( where_clause=self.get_where_clause( project=projects_row['project'], speaker=speakers_row['speaker'])), filename='selected_all_parts.csv') with open(all_parts) as all_parts_csv: parts_reader = csv.DictReader( all_parts_csv) first_sheet = True for parts_row in parts_reader: filename = f"{projects_row['project']}_{speakers_row['speaker']}_" \ f"{parts_row['performance_date']}_{parts_row['part']}_{interval_in_seconds}.csv" print(filename) new_file = athena_db.query_athena_and_download( SELECT_TRANSCRIPT.format( project=projects_row[ 'project'], speaker=speakers_row[ 'speaker'], performance_date=parts_row[ 'performance_date'], part=parts_row['part'], interval_in_seconds= interval_in_seconds), filename) os.replace(new_file, f'./csv/{filename}') if first_sheet: body = { 'requests': { "updateSheetProperties": { "fields": "title,gridProperties.rowCount,gridProperties.columnCount,gridProperties.frozenRowCount", "properties": { "title": f"{parts_row['performance_date']} / {parts_row['part']}", "gridProperties": { "rowCount": 3, "columnCount": 3, "frozenRowCount": 1 }, "index": 0 } } }, 'includeSpreadsheetInResponse': True } response = google_sheets.spreadsheets( ).batchUpdate( spreadsheetId=speaker_id, body=body).execute() sheet_id = response[ 'updatedSpreadsheet'][ 'sheets'][0]['properties'][ 'sheetId'] first_sheet = False else: body = { "requests": { "addSheet": { "properties": { "title": f"{parts_row['performance_date']} / {parts_row['part']}", "gridProperties": { "rowCount": 3, "columnCount": 3, "frozenRowCount": 1 } } } } } response = google_sheets.spreadsheets( ).batchUpdate( spreadsheetId=speaker_id, body=body).execute() sheet_id = response['replies'][0][ 'addSheet']['properties'][ 'sheetId'] with open( f'./csv/{filename}', 'r', encoding="utf-8") as csv_file: csvContents = csv_file.read() body = { 'requests': [{ 'pasteData': { "coordinate": { "sheetId": sheet_id, "rowIndex": "0", # adapt this if you need different positioning "columnIndex": "0", # adapt this if you need different positioning }, "data": csvContents, "type": 'PASTE_NORMAL', "delimiter": ',', } }] } response = google_sheets.spreadsheets( ).batchUpdate(spreadsheetId=speaker_id, body=body).execute() body = { "requests": [{ "repeatCell": { "range": { "sheetId": sheet_id, "startRowIndex": 0, "startColumnIndex": 0 }, "cell": { "userEnteredFormat": { "verticalAlignment": "TOP", "wrapStrategy": "WRAP" }, }, "fields": "userEnteredFormat.wrapStrategy,userEnteredFormat.verticalAlignment" } }, { "updateDimensionProperties": { "range": { "sheetId": sheet_id, "dimension": "COLUMNS", "startIndex": 0, "endIndex": 1 }, "properties": { "pixelSize": 60 }, "fields": "pixelSize" } }, { "updateDimensionProperties": { "range": { "sheetId": sheet_id, "dimension": "COLUMNS", "startIndex": 1 }, "properties": { "pixelSize": 280 }, "fields": "pixelSize" } }] } response = google_sheets.spreadsheets( ).batchUpdate(spreadsheetId=speaker_id, body=body).execute() finally: shutil.rmtree('./csv')
def parse_words(self, project=None, speaker=None, performance_date=None, part=None): self.repair_table_metadata() self.repair_table_word() where_clause = self.get_where_clause(project=project, speaker=speaker, performance_date=performance_date, part=part) if where_clause == '': select = SELECT_NON_PARSED_TRANSCRIPTS.format( where_clause=where_clause).replace(' and ', ' where ', 1).replace('\n\n', '\n') else: select = SELECT_NON_PARSED_TRANSCRIPTS.format( where_clause=where_clause) athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) unparsed_records = athena_db.query_athena_and_download( query_string=select, filename='unparsed_records.csv') with open(unparsed_records) as unparsed_file: reader = csv.DictReader(unparsed_file) database_has_changed = False try: print("Parse words...") for row in reader: print( f"{row['speaker']}_{row['performance_date']}_{row['part']}_{row['service']}_{row['speaker_type']}_{row['section']}" ) transcript = read_dict_from_s3( self.bucket, f"transcript/service={row['service']}/project={row['project']}/speaker={row['speaker']}/" f"performance_date={row['performance_date']}/part={row['part']}/" f"speaker_type={row['speaker_type']}/timeframe={row['timeframe']}/" f"section={row['section']}/transcript.json.bz2", compressed=True) protagonist_words, non_protagonist_words = parse_words( transcript=transcript, speaker_type=row['speaker_type'], service=row['service']) partitions = OrderedDict() partitions['project'] = row['project'] partitions['speaker'] = row['speaker'] partitions['performance_date'] = row['performance_date'] partitions['part'] = row['part'] partitions['service'] = row['service'] partitions['protagonist'] = -1 partitions['timeframe'] = row['timeframe'] partitions['section'] = row['section'] if len(protagonist_words) > 0: partitions['protagonist'] = 1 save_data_in_s3(content=protagonist_words, s3_bucket=self.bucket, s3_key='word.json', prefix='word', partitions=partitions) if len(non_protagonist_words) > 0: partitions['protagonist'] = 0 save_data_in_s3(content=non_protagonist_words, s3_bucket=self.bucket, s3_key='word.json', prefix='word', partitions=partitions) database_has_changed = True finally: if database_has_changed: self.repair_table_word()
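# save_data_in_s3 is a project helper. Judging from the object keys handled elsewhere in this
# file (e.g. word/project=.../speaker=.../performance_date=.../part=.../service=.../protagonist=.../
# timeframe=.../section=.../word.json.bz2), it stores the content under a Hive-style prefix built
# from the OrderedDict of partitions. The key construction below is an assumption, not the actual
# helper; compression of the payload is presumably handled inside save_data_in_s3 itself.
def partitioned_key_sketch(prefix, partitions, s3_key):
    """Compose 'prefix/name1=value1/.../s3_key' from an ordered mapping of partition values."""
    segments = ["{}={}".format(name, value) for name, value in partitions.items()]
    return "/".join([prefix] + segments + [s3_key])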
def repair_table_word(self): print("Going to repair table word...") athena_db = AthenaDatabase(database=self.config['aws']['athena'], s3_output=self.bucket) athena_db.query_athena_and_wait(query_string="MSCK REPAIR TABLE word") print("Done.")
def export_tweepy(self, yesterday): json_file = Path(Path(__file__).parent, 'tmp', 'tweepy_video_id.json') self.create_json_tweepy_file(source="tweet_from_video_id", destination=json_file) json_video_id_file_compressed = compress(json_file) json_file = Path( Path(__file__).parent, 'tmp', 'tweepy_user_screen.json') self.create_json_tweepy_file(source="tweet_from_screen_name", destination=json_file) json_screen_name_file_compressed = compress(json_file) s3 = boto3.resource('s3') s3_filename = "tweepy_video_id/reference_date={}/tweepy_from_video_id.json.bz2".format( yesterday) s3.Bucket(self.s3_data).upload_file(str(json_video_id_file_compressed), s3_filename) s3_filename = "tweepy_screen_name/reference_date={}/tweepy_from_screen_name.json.bz2".format( yesterday) s3.Bucket(self.s3_data).upload_file( str(json_screen_name_file_compressed), s3_filename) athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) athena_db.query_athena_and_wait( query_string="DROP TABLE tweepy_video_id") athena_db.query_athena_and_wait( query_string=ATHENA_CREATE_TWEEPY_VIDEO_ID.format( structure=STRUCTURE_TWEEPY_ATHENA, s3_bucket=self.s3_data)) athena_db.query_athena_and_wait( query_string="MSCK REPAIR TABLE tweepy_video_id") athena_db.query_athena_and_wait( query_string="DROP TABLE tweepy_screen_name") athena_db.query_athena_and_wait( query_string=ATHENA_CREATE_TWEEPY_SCREEN_NAME.format( structure=STRUCTURE_TWEEPY_ATHENA, s3_bucket=self.s3_data)) athena_db.query_athena_and_wait( query_string="MSCK REPAIR TABLE tweepy_screen_name")
# This script was used when I decided to create a table called tweet_user_url to unnest the URLs in each Tweet. # To make backward compatibility possible I had to create the data files for previous dates (< 2019-11-28). from internet_scholar import AthenaDatabase, compress import boto3 from datetime import date, timedelta import uuid current_date = date(2019, 8, 15) athena_db = AthenaDatabase(database='internet_scholar', s3_output='internet-scholar-admin') query = """ select twitter_stream.id_str as tweet_id, twitter_stream.user.id_str as user_id, url.expanded_url as url from internet_scholar.twitter_stream_raw as twitter_stream, unnest(entities.urls) as t(url) where creation_date = '{creation_date}' and url.display_url not like 'twitter.com/%' order by tweet_id, user_id, url; """ while current_date <= date(2019, 11, 27): print(str(current_date)) tweet_user_url = athena_db.query_athena_and_download(
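# The backfill loop above is cut off mid-call. Based on the pattern used in the rest of this
# codebase (query Athena, compress the CSV, upload it to a creation_date partition, advance one
# day), the body plausibly continued along the lines sketched below. This is a reconstruction,
# not the original script.
#
#     tweet_user_url = athena_db.query_athena_and_download(
#         query_string=query.format(creation_date=str(current_date)),
#         filename=str(current_date) + '.csv')
#     compressed_file = compress(filename=tweet_user_url, delete_original=True)
#     s3_filename = "tweet_user_url/creation_date={}/{}.csv.bz2".format(
#         str(current_date), uuid.uuid4().hex)
#     boto3.resource('s3').Bucket('internet-scholar').upload_file(str(compressed_file), s3_filename)
#     current_date = current_date + timedelta(days=1)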
def create_louvain(min_users, timespan, final_date, end): s3 = boto3.resource('s3') athena_db = AthenaDatabase(database='internet_scholar', s3_output='internet-scholar-admin') while final_date <= end: print('Louvain - {}'.format(str(final_date))) edges = athena_db.query_athena_and_download( query_string=EDGES_LOUVAIN.format(final_date=str(final_date), min_users=min_users, timespan=timespan), filename='edges_louvain.csv') g = nx.Graph() with open(edges, newline='', encoding="utf8") as csv_reader: reader = csv.DictReader(csv_reader) for edge in reader: g.add_edge(edge['source_id'], edge['target_id'], weight=int(edge['weight'])) with open('./louvain.csv', 'w', encoding="utf8") as csv_writer: writer = csv.DictWriter(csv_writer, fieldnames=[ 'resolution', 'channel_id', 'cluster', 'graph_size', 'cluster_size', 'cluster_count' ], dialect='unix') writer.writeheader() nodes = list(g) graph_size = len(nodes) for resolution in numpy.arange(10, 0, -0.1): partition = community.best_partition(g, resolution=resolution, randomize=False) cluster_count = len(set(partition.values())) for partition_number in set(partition.values()): new_partition = list() for channel_id in partition.keys(): if partition[channel_id] == partition_number: new_partition.append(channel_id) cluster_size = len(new_partition) new_partition_number = nodes.index(min(new_partition)) for item in new_partition: new_record = dict() new_record['resolution'] = "{:.1f}".format(resolution) new_record['channel_id'] = item new_record['cluster'] = new_partition_number new_record['graph_size'] = graph_size new_record['cluster_size'] = cluster_size new_record['cluster_count'] = cluster_count writer.writerow(new_record) compressed_file = compress(filename='./louvain.csv', delete_original=True) s3_filename = "youtube_graph_louvain/min_users={min_users}/" \ "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format( min_users=min_users, timespan=timespan, final_date=str(final_date)) s3.Bucket('internet-scholar').upload_file(str(compressed_file), s3_filename) final_date = final_date + timedelta(days=1) athena_db.query_athena_and_wait( query_string='drop table if exists youtube_graph_louvain') athena_db.query_athena_and_wait( query_string=CREATE_YOUTUBE_GRAPH_LOUVAIN.format( s3_data='internet-scholar')) athena_db.query_athena_and_wait( query_string='MSCK REPAIR TABLE youtube_graph_louvain')
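# Minimal, self-contained illustration of the Louvain sweep performed above, assuming the
# python-louvain package (imported as `community`, as in this module) and networkx are
# installed. best_partition() is the call the function relies on; community.modularity() is
# shown only as a way to compare resolutions and is not used above.
import networkx as nx
import community


def louvain_at_resolution(edge_triples, resolution):
    """edge_triples: iterable of (source_id, target_id, weight) tuples."""
    graph = nx.Graph()
    for source, target, weight in edge_triples:
        graph.add_edge(source, target, weight=weight)
    partition = community.best_partition(graph, resolution=resolution, randomize=False)
    return partition, community.modularity(partition, graph)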
def collect_video_snippets(self): logging.info("Start collecting video snippets") athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) query = SELECT_YOUTUBE_VIDEOS query_count = SELECT_COUNT_YOUTUBE_VIDEOS if athena.table_exists("youtube_video_snippet"): logging.info("Table youtube_video_snippet exists") query = query + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS query_count = query_count + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS logging.info( "Download IDs for all Youtube videos that have not been processed yet" ) video_count = int( athena.query_athena_and_get_result( query_string=query_count)['video_count']) logging.info("There are %d links to be processed: download them", video_count) video_ids_csv = athena.query_athena_and_download( query_string=query, filename="video_ids.csv") output_json = Path( Path(__file__).parent, 'tmp', 'youtube_video_snippet.json') Path(output_json).parent.mkdir(parents=True, exist_ok=True) current_key = 0 youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[current_key]['developer_key'], cache_discovery=False) with open(video_ids_csv, newline='') as csv_reader: with open(output_json, 'w') as json_writer: reader = csv.DictReader(csv_reader) num_videos = 0 for video_id in reader: if num_videos % self.LOGGING_INTERVAL == 0: logging.info("%d out of %d videos processed", num_videos, video_count) num_videos = num_videos + 1 service_unavailable = 0 no_response = True while no_response: try: response = youtube.videos().list( part="snippet", id=video_id['video_id']).execute() no_response = False except HttpError as e: if "403" in str(e): logging.info( "Invalid {} developer key: {}".format( current_key, self.credentials[current_key] ['developer_key'])) current_key = current_key + 1 if current_key >= len(self.credentials): raise else: youtube = googleapiclient.discovery.build( serviceName="youtube", version="v3", developerKey=self.credentials[ current_key]['developer_key'], cache_discovery=False) elif "503" in str(e): logging.info("Service unavailable") service_unavailable = service_unavailable + 1 if service_unavailable <= 10: time.sleep( self.WAIT_WHEN_SERVICE_UNAVAILABLE) else: raise else: raise if len(response.get('items', [])) == 0: response['id'] = video_id['video_id'] response['retrieved_at'] = datetime.utcnow().strftime( "%Y-%m-%d %H:%M:%S.%f")[:-3] response[ 'description'] = "Video unavailable. It has probably been removed by the user." json_writer.write("{}\n".format(json.dumps(response))) else: for item in response['items']: item['snippet']['publishedAt'] = item['snippet'][ 'publishedAt'].rstrip('Z').replace('T', ' ') item['retrieved_at'] = datetime.utcnow().strftime( "%Y-%m-%d %H:%M:%S.%f")[:-3] json_writer.write("{}\n".format(json.dumps(item))) logging.info("Compress file %s", output_json) compressed_file = compress(filename=output_json, delete_original=True) s3 = boto3.resource('s3') s3_filename = "youtube_video_snippet/{}-{}.json.bz2".format( datetime.utcnow().strftime("%Y-%m-%d"), num_videos) logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename) s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename) logging.info("Concluded collecting video snippets") athena.query_athena_and_wait( query_string=CREATE_VIDEO_SNIPPET_JSON.format( s3_bucket=self.s3_data))
def create_classification_tables(min_users, timespan, related_date, end_related_date, graph_date_difference): graph_date = related_date + timedelta(days=graph_date_difference) database = sqlite3.connect('./classification.sqlite') sqlite_aws = SqliteAWS(database=database, s3_admin='internet-scholar-admin', s3_data='internet-scholar', athena_db='internet_scholar') sqlite_aws.convert_athena_query_to_sqlite( table_name='recommendation', query=RECOMMENDATION.format(begin_date=str(related_date), end_date=str(end_related_date))) database.execute(CREATE_VIEW_RELATED_VIDEO) database.row_factory = sqlite3.Row while related_date <= end_related_date: print('Classification - related: {related_date} graph: {graph_date}'. format(related_date=str(related_date), graph_date=str(graph_date))) sqlite_aws.convert_s3_csv_to_sqlite( s3_path='youtube_graph_louvain/' 'min_users={min_users}/' 'timespan={timespan}/' 'final_date={graph_date}/louvain.csv.bz2'.format( min_users=min_users, timespan=timespan, graph_date=str(graph_date))) database.execute(CREATE_TABLE_YOUTUBE_GRAPH_CLASSIFICATION) cursor_related = database.cursor() cursor_related.execute( SELECT_RELATED_VIDEO.format(related_date=str(related_date))) for related_video in cursor_related: cursor_related_in_graph = database.cursor() cursor_related_in_graph.execute( SELECT_RELATED_IN_GRAPH.format( channel_id=related_video['channel_id'])) exists_in_graph = False for related_in_graph in cursor_related_in_graph: exists_in_graph = True # add IDENTITY record database.execute(INSERT_CLASSIFICATION, (related_in_graph['channel_id'], related_in_graph['channel_id'], '0.0', '0', related_in_graph['graph_size'], '1', related_in_graph['graph_size'], 'IDENTITY')) cursor_channel_in_the_same_cluster = database.cursor() cursor_channel_in_the_same_cluster.execute( SELECT_CHANNELS_IN_THE_SAME_CLUSTER.format( channel_id=related_video['channel_id'])) # add KINSHIP record for channel_in_the_same_cluster in cursor_channel_in_the_same_cluster: database.execute( INSERT_CLASSIFICATION, (related_in_graph['channel_id'], channel_in_the_same_cluster['channel_id'], channel_in_the_same_cluster['resolution'], channel_in_the_same_cluster['cluster'], channel_in_the_same_cluster['graph_size'], channel_in_the_same_cluster['cluster_size'], channel_in_the_same_cluster['cluster_count'], 'KINSHIP')) cursor_all_nodes_in_graph = database.cursor() cursor_all_nodes_in_graph.execute(SELECT_ALL_NODES_IN_GRAPH) # add OPPOSITION record for node_in_graph in cursor_all_nodes_in_graph: database.execute( INSERT_CLASSIFICATION, (related_in_graph['channel_id'], node_in_graph['channel_id'], '10.0', '-1', related_in_graph['graph_size'], related_in_graph['graph_size'], '1', 'OPPOSITION')) if not exists_in_graph: cursor_all_nodes_in_graph = database.cursor() cursor_all_nodes_in_graph.execute(SELECT_ALL_NODES_IN_GRAPH) # add UNKNOWN records for node_in_graph in cursor_all_nodes_in_graph: database.execute(INSERT_CLASSIFICATION, (related_video['channel_id'], node_in_graph['channel_id'], '-1', '-1', '-1', '-1', '-1', 'UNKNOWN')) database.execute( CREATE_TABLE_YOUTUBE_GRAPH_RELATED.format( related_date=str(related_date))) database.execute( INSERT_RELATED_KNOWN.format(related_date=str(related_date))) database.execute( INSERT_UNKNOWN_IDENTITY.format(related_date=str(related_date))) database.execute( INSERT_UNKNOWN_RECOMMENDED.format(related_date=str(related_date))) database.execute( INSERT_UNKNOWN_TRENDING.format(related_date=str(related_date))) database.execute( INSERT_UNKNOWN_BOTH.format(related_date=str(related_date))) 
database.commit() sqlite_aws.convert_sqlite_to_s3_csv( s3_path='youtube_graph_classification/' 'related_date={related_date}/' 'graph_date_difference={graph_date_difference}/' 'min_users={min_users}/' 'timespan={timespan}/' 'classification.csv.bz2'.format( related_date=str(related_date), graph_date_difference=graph_date_difference, min_users=min_users, timespan=timespan), order_by='related_channel_id, graph_channel_id') sqlite_aws.convert_sqlite_to_s3_csv( s3_path='youtube_graph_related/' 'related_date={related_date}/' 'graph_date_difference={graph_date_difference}/' 'min_users={min_users}/' 'timespan={timespan}/' 'related.csv.bz2'.format( related_date=str(related_date), graph_date_difference=graph_date_difference, min_users=min_users, timespan=timespan), order_by='trending_channel_id, recommended_channel_id') database.execute('DROP TABLE youtube_graph_related') database.execute('DROP TABLE youtube_graph_classification') database.execute('DROP TABLE youtube_graph_louvain') database.commit() graph_date = graph_date + timedelta(days=1) related_date = related_date + timedelta(days=1) athena = AthenaDatabase(database='internet_scholar', s3_output='internet-scholar-admin') athena.query_athena_and_wait( 'DROP TABLE IF EXISTS youtube_graph_classification') athena.query_athena_and_wait( CREATE_ATHENA_TABLE_YOUTUBE_GRAPH_CLASSIFICATION.format( s3_data='internet-scholar')) athena.query_athena_and_wait( 'MSCK REPAIR TABLE youtube_graph_classification') athena.query_athena_and_wait('DROP TABLE IF EXISTS youtube_graph_related') athena.query_athena_and_wait( CREATE_ATHENA_TABLE_YOUTUBE_GRAPH_RELATED.format( s3_data='internet-scholar')) athena.query_athena_and_wait('MSCK REPAIR TABLE youtube_graph_related') database.close() os.remove('classification.sqlite')
def expand_urls(self, creation_date=None): logging.info("begin: expand URLs") athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin) yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d") if creation_date is None: creation_date = yesterday logging.info("Expand URLs that were tweeted on {creation_date}".format( creation_date=creation_date)) query_tweet_user_url = self.__TWEET_USER_URL.format( creation_date=creation_date) query = self.__UNVALIDATED_URLS.format(creation_date=creation_date) query_count = self.__COUNT_UNVALIDATED_URLS.format( creation_date=creation_date) if athena.table_exists("validated_url"): logging.info("Table validated_url exists") query = query + " and url not in (select validated_url.url from validated_url)" query_count = query_count + " and url not in (select validated_url.url from validated_url)" logging.info('Update table tweet_user_url') tweet_user_url = athena.query_athena_and_download( query_string=query_tweet_user_url.format( creation_date=creation_date), filename=creation_date + '.csv') compressed_file = compress(filename=tweet_user_url) s3 = boto3.resource('s3') s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format( creation_date=creation_date, code=uuid.uuid4().hex) logging.info('Upload data file that will comprise tweet_user_url') s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename) logging.info('Update table tweet_user_url on Athena') logging.info( "Create Athena table tweet_user_url if does not exist already") athena.query_athena_and_wait( query_string=self.__CREATE_TABLE_TWEET_USER_URL.format( s3_data=self.s3_data)) athena.query_athena_and_wait( query_string="MSCK REPAIR TABLE tweet_user_url") link_count = int( athena.query_athena_and_get_result( query_string=query_count)['link_count']) logging.info("There are %d links to be processed: download them", link_count) unvalidated_urls = athena.query_athena_and_download( query_string=query, filename="unvalidated_urls.csv") with open(unvalidated_urls, newline='') as csv_reader: validated_urls = Path( Path(__file__).parent, 'tmp', 'validated_urls.csv') Path(validated_urls).parent.mkdir(parents=True, exist_ok=True) logging.info("Create file %s for validated URLs", validated_urls) with open(str(validated_urls), 'w') as csv_writer: reader = csv.DictReader(csv_reader) writer = csv.DictWriter(csv_writer, fieldnames=[ 'url', 'validated_url', 'status_code', 'content_type', 'content_length', 'created_at' ], dialect='unix') url_expander = URLExpander() num_links = 0 for url in reader: if num_links % self.LOGGING_INTERVAL == 0: logging.info("%d out of %d links processed", num_links, link_count) num_links = num_links + 1 for expanded_url in url_expander.expand_url(url['url']): writer.writerow(expanded_url) logging.info("All links processed") logging.info("Compress file %s", validated_urls) compressed_file = compress(filename=validated_urls, delete_original=True) if creation_date == yesterday: filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format( time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime()), link_count) else: filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format( creation_date + '-23-59-59', link_count) logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, filename_s3) s3.Bucket(self.s3_data).upload_file(str(compressed_file), filename_s3) logging.info( "Delete previous validated_url data: will be generated again") s3.Bucket( self.s3_data).objects.filter(Prefix="validated_url/").delete() logging.info( "Create 
Athena table validated_url_raw if it does not already exist")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL_RAW.format(
                s3_data=self.s3_data))
        logging.info("Drop Athena table validated_url")
        athena.query_athena_and_wait(
            query_string="drop table if exists validated_url")
        logging.info("Create Athena table validated_url through CTAS")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL.format(
                s3_data=self.s3_data))
        logging.info("END: expand URLs")
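# URLExpander is defined elsewhere in the project; the DictWriter above expects each expanded
# record to provide 'url', 'validated_url', 'status_code', 'content_type', 'content_length'
# and 'created_at'. The class below is a simplified, hypothetical expander producing records
# of that shape -- the real implementation may resolve shorteners, retry, or follow redirects
# differently.
import requests
from datetime import datetime


class URLExpanderSketch:
    def expand_url(self, url):
        timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        try:
            response = requests.head(url, allow_redirects=True, timeout=10)
            yield {'url': url,
                   'validated_url': response.url,
                   'status_code': response.status_code,
                   'content_type': response.headers.get('content-type', ''),
                   'content_length': response.headers.get('content-length', ''),
                   'created_at': timestamp}
        except requests.RequestException:
            yield {'url': url,
                   'validated_url': url,
                   'status_code': None,
                   'content_type': '',
                   'content_length': '',
                   'created_at': timestamp}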