def create_edges(min_users, timespan, final_date, end):
    """Build the daily sliding-window edge list and publish it as the
    partitioned Athena table youtube_graph_edge."""
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    min_date = athena_db.query_athena_and_get_result(
        query_string=MIN_DATE)['min_date']
    min_date = datetime.strptime(min_date, '%Y-%m-%d').date()
    initial_date = final_date - timedelta(days=timespan - 1)
    while final_date <= end:
        print('Edges - {}'.format(str(final_date)))
        # Skip windows that start before the earliest data available on Athena.
        if initial_date >= min_date:
            edges = athena_db.query_athena_and_download(
                query_string=SELECT_EDGES.format(initial_date=str(initial_date),
                                                 final_date=str(final_date),
                                                 min_users=min_users),
                filename='edges.csv')
            compressed_file = compress(filename=edges, delete_original=True)
            s3_filename = "youtube_graph_edge/min_users={min_users}/" \
                          "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format(
                              min_users=min_users,
                              timespan=timespan,
                              final_date=str(final_date))
            s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                                      s3_filename)
        final_date = final_date + timedelta(days=1)
        initial_date = initial_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_edge')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_EDGE.format(s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_edge')
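# A minimal usage sketch for create_edges. The parameter values below are
# illustrative assumptions, not the project's real configuration: a 7-day
# sliding window over November 2019, keeping channel pairs shared by at
# least 3 Twitter users.
#
#   create_edges(min_users=3, timespan=7,
#                final_date=date(2019, 11, 1), end=date(2019, 11, 27))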
def export_twint(self, yesterday):
    """Upload the twint SQLite captures to S3 as JSON and rebuild the
    corresponding Athena tables."""
    tweet_from_video_id = Path(Path(__file__).parent, 'tmp',
                               'tweet_from_video_id.sqlite')
    json_video_id_file = Path(Path(__file__).parent, 'tmp',
                              'twint_from_video_id.json')
    self.create_json_twint_file(source=tweet_from_video_id,
                                destination=json_video_id_file)
    json_video_id_file_compressed = compress(json_video_id_file)

    tweet_from_screen_name = Path(Path(__file__).parent, 'tmp',
                                  'tweet_from_screen_name.sqlite')
    json_screen_name_file = Path(Path(__file__).parent, 'tmp',
                                 'twint_from_screen_name.json')
    self.create_json_twint_file(source=tweet_from_screen_name,
                                destination=json_screen_name_file)
    json_screen_name_file_compressed = compress(json_screen_name_file)

    s3 = boto3.resource('s3')
    s3_filename = "twint_video_id/reference_date={}/twint_from_video_id.json.bz2".format(
        yesterday)
    s3.Bucket(self.s3_data).upload_file(str(json_video_id_file_compressed),
                                        s3_filename)
    s3_filename = "twint_screen_name/reference_date={}/twint_from_screen_name.json.bz2".format(
        yesterday)
    s3.Bucket(self.s3_data).upload_file(str(json_screen_name_file_compressed),
                                        s3_filename)

    athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    # "IF EXISTS" keeps the rebuild from failing on a first run.
    athena_db.query_athena_and_wait(
        query_string="DROP TABLE IF EXISTS twint_video_id")
    athena_db.query_athena_and_wait(
        query_string=ATHENA_CREATE_TWINT_VIDEO_ID.format(
            structure=STRUCTURE_TWINT_ATHENA, s3_bucket=self.s3_data))
    athena_db.query_athena_and_wait(
        query_string="MSCK REPAIR TABLE twint_video_id")
    athena_db.query_athena_and_wait(
        query_string="DROP TABLE IF EXISTS twint_screen_name")
    athena_db.query_athena_and_wait(
        query_string=ATHENA_CREATE_TWINT_SCREEN_NAME.format(
            structure=STRUCTURE_TWINT_ATHENA, s3_bucket=self.s3_data))
    athena_db.query_athena_and_wait(
        query_string="MSCK REPAIR TABLE twint_screen_name")
def create_gexf(min_users, timespan, final_date, end):
    """Assemble one dynamic GEXF graph per day from the node, Louvain, and
    edge files previously published to S3."""
    s3 = boto3.resource('s3')
    while final_date <= end:
        print('GEXF - {}'.format(str(final_date)))
        gexf = Element('gexf', {
            'xmlns': "http://www.gexf.net/1.3",
            'version': "1.3",
            'xmlns:viz': "http://www.gexf.net/1.3/viz",
            'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:schemaLocation': "http://www.gexf.net/1.3 http://www.gexf.net/1.3/gexf.xsd"
        })
        graph = SubElement(gexf, 'graph', {
            'mode': "dynamic",
            'defaultedgetype': "undirected",
            'timeformat': "double",
            'timerepresentation': "timestamp"
        })
        # Static node attributes (ids 1-6): channel statistics.
        attributes = SubElement(graph, 'attributes',
                                {'class': "node", 'mode': "static"})
        SubElement(attributes, 'attribute',
                   {'id': '1', 'title': 'view_count', 'type': 'long'})
        SubElement(attributes, 'attribute',
                   {'id': '2', 'title': 'cumulative_view_count', 'type': 'long'})
        SubElement(attributes, 'attribute',
                   {'id': '3', 'title': 'subscriber_count', 'type': 'long'})
        SubElement(attributes, 'attribute',
                   {'id': '4', 'title': 'cumulative_subscriber_count', 'type': 'long'})
        SubElement(attributes, 'attribute',
                   {'id': '5', 'title': 'video_count', 'type': 'long'})
        SubElement(attributes, 'attribute',
                   {'id': '6', 'title': 'cumulative_video_count', 'type': 'long'})
        # Dynamic node attribute (id 7): Louvain cluster, with the resolution
        # recorded in the 'timestamp' slot (hence timeformat="double").
        attributes = SubElement(graph, 'attributes',
                                {'class': "node", 'mode': "dynamic"})
        SubElement(attributes, 'attribute',
                   {'id': '7', 'title': 'cluster', 'type': 'long'})
        nodes = SubElement(graph, 'nodes')
        edges = SubElement(graph, 'edges')

        s3_filename = "youtube_graph_node/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/nodes.csv.bz2".format(
                          min_users=min_users, timespan=timespan,
                          final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename, './nodes.csv.bz2')
        nodes_file = decompress(filename='./nodes.csv.bz2')
        with open(nodes_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            dict_attvalues = dict()
            for node_record in reader:
                node = SubElement(nodes, 'node', {
                    'id': node_record['channel_id'],
                    'label': node_record['channel_title']
                })
                dict_attvalues[node_record['channel_id']] = SubElement(node,
                                                                       'attvalues')
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '1', 'value': node_record['view_count']})
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '2', 'value': node_record['cumulative_view_count']})
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '3', 'value': node_record['subscriber_count']})
                # Attribute 4 is declared as cumulative_subscriber_count; the
                # original wrote cumulative_view_count here, duplicating
                # attribute 2.
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '4', 'value': node_record['cumulative_subscriber_count']})
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '5', 'value': node_record['video_count']})
                SubElement(dict_attvalues[node_record['channel_id']], 'attvalue',
                           {'for': '6', 'value': node_record['cumulative_video_count']})

        s3_filename = "youtube_graph_louvain/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format(
                          min_users=min_users, timespan=timespan,
                          final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename, './louvain.csv.bz2')
        louvain_file = decompress(filename='./louvain.csv.bz2')
        with open(louvain_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for louvain_record in reader:
                SubElement(dict_attvalues[louvain_record['channel_id']], 'attvalue', {
                    'for': '7',
                    'value': louvain_record['cluster'],
                    'timestamp': louvain_record['resolution']
                })

        s3_filename = "youtube_graph_edge/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format(
                          min_users=min_users, timespan=timespan,
                          final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename, './edges.csv.bz2')
        edges_file = decompress(filename='./edges.csv.bz2')
        with open(edges_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for edge_record in reader:
                # GEXF defines the edge weight attribute in lowercase; the
                # capitalized 'Weight' in the original would be ignored by
                # schema-aware readers.
                SubElement(edges, 'edge', {
                    'source': edge_record['source_id'],
                    'target': edge_record['target_id'],
                    'weight': edge_record['Weight']
                })

        with open('./network.gexf', 'wb') as f:
            f.write(prettify_xml(gexf))
        compressed_gexf = compress(filename='./network.gexf')
        s3_filename = "youtube_graph_gexf/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/network.gexf.bz2".format(
                          min_users=min_users, timespan=timespan,
                          final_date=str(final_date))
        s3.Bucket('internet-scholar').upload_file(str(compressed_gexf), s3_filename)
        final_date = final_date + timedelta(days=1)
def create_louvain(min_users, timespan, final_date, end):
    """Run Louvain community detection at resolutions 10.0 down to 0.1 for
    each day and publish the results as the Athena table youtube_graph_louvain."""
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    while final_date <= end:
        print('Louvain - {}'.format(str(final_date)))
        edges = athena_db.query_athena_and_download(
            query_string=EDGES_LOUVAIN.format(final_date=str(final_date),
                                              min_users=min_users,
                                              timespan=timespan),
            filename='edges_louvain.csv')
        g = nx.Graph()
        with open(edges, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for edge in reader:
                g.add_edge(edge['source_id'], edge['target_id'],
                           weight=int(edge['weight']))
        with open('./louvain.csv', 'w', encoding="utf8") as csv_writer:
            writer = csv.DictWriter(csv_writer,
                                    fieldnames=['resolution', 'channel_id',
                                                'cluster', 'graph_size',
                                                'cluster_size', 'cluster_count'],
                                    dialect='unix')
            writer.writeheader()
            nodes = list(g)
            graph_size = len(nodes)
            for resolution in numpy.arange(10, 0, -0.1):
                partition = community.best_partition(g, resolution=resolution,
                                                     randomize=False)
                cluster_count = len(set(partition.values()))
                for partition_number in set(partition.values()):
                    new_partition = [channel_id for channel_id in partition
                                     if partition[channel_id] == partition_number]
                    cluster_size = len(new_partition)
                    # Relabel each cluster by the position of its
                    # lexicographically smallest member, so labels are
                    # comparable across resolutions.
                    new_partition_number = nodes.index(min(new_partition))
                    for item in new_partition:
                        writer.writerow({
                            'resolution': "{:.1f}".format(resolution),
                            'channel_id': item,
                            'cluster': new_partition_number,
                            'graph_size': graph_size,
                            'cluster_size': cluster_size,
                            'cluster_count': cluster_count
                        })
        compressed_file = compress(filename='./louvain.csv', delete_original=True)
        s3_filename = "youtube_graph_louvain/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format(
                          min_users=min_users, timespan=timespan,
                          final_date=str(final_date))
        s3.Bucket('internet-scholar').upload_file(str(compressed_file), s3_filename)
        final_date = final_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_louvain')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_LOUVAIN.format(s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_louvain')
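# create_edges, create_louvain, and create_gexf are meant to run in that
# order: create_edges publishes youtube_graph_edge, create_louvain clusters a
# per-day edge query built with the same parameters, and create_gexf stitches
# node files (youtube_graph_node is produced elsewhere), Louvain clusters, and
# edges into one GEXF per day. A hypothetical driver; the default parameter
# values are assumptions for illustration:
def run_graph_pipeline_sketch(min_users=3, timespan=7,
                              start=date(2019, 11, 1),
                              end=date(2019, 11, 27)):
    create_edges(min_users, timespan, start, end)
    create_louvain(min_users, timespan, start, end)
    create_gexf(min_users, timespan, start, end)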
def collect_video_snippets(self):
    logging.info("Start collecting video snippets")
    athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    # On re-runs, restrict the queries to videos that are not already in
    # youtube_video_snippet.
    if not athena.table_exists("youtube_video_snippet"):
        select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO
        select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO
    else:
        logging.info("Table youtube_video_snippet exists")
        select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO + EXTRA_TWITTER_STREAM_VIDEO
        select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO + EXTRA_YOUTUBE_RELATED_VIDEO
    queries = [select_twitter_stream_video]
    if athena.table_exists("youtube_related_video"):
        queries.append(select_youtube_related_video)
    query = " union all ".join(queries)
    query_count = SELECT_COUNT.format(query)
    query_group_by = SELECT_GROUP_BY.format(query)
    logging.info("Download IDs for all Youtube videos that have not been processed yet")
    video_count = int(athena.query_athena_and_get_result(
        query_string=query_count)['video_count'])
    logging.info("There are %d links to be processed: download them", video_count)
    video_ids_csv = athena.query_athena_and_download(query_string=query_group_by,
                                                     filename="video_ids.csv")
    output_json = Path(Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    current_key = 0
    try:
        youtube = googleapiclient.discovery.build(
            serviceName="youtube", version="v3",
            developerKey=self.credentials[current_key]['developer_key'],
            cache_discovery=False)
    except UnknownApiNameOrVersion:
        # Fall back to the published discovery document.
        service = read_dict_from_url(
            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
        youtube = googleapiclient.discovery.build_from_document(
            service=service,
            developerKey=self.credentials[current_key]['developer_key'])
    with open(video_ids_csv, newline='') as csv_reader:
        with open(output_json, 'w') as json_writer:
            reader = csv.DictReader(csv_reader)
            num_videos = 0
            for video_id in reader:
                if num_videos % self.LOGGING_INTERVAL == 0:
                    logging.info("%d out of %d videos processed",
                                 num_videos, video_count)
                num_videos = num_videos + 1
                connection_reset_by_peer = 0
                service_unavailable = 0
                no_response = True
                response = dict()
                while no_response:
                    try:
                        response = youtube.videos().list(
                            part="snippet", id=video_id['video_id']).execute()
                        no_response = False
                    except SocketError as e:
                        if e.errno != errno.ECONNRESET:
                            logging.info("Other socket error!")
                            raise
                        else:
                            connection_reset_by_peer = connection_reset_by_peer + 1
                            logging.info("Connection reset by peer! {}".format(
                                connection_reset_by_peer))
                            if connection_reset_by_peer <= 10:
                                time.sleep(self.WAIT_WHEN_CONNECTION_RESET_BY_PEER)
                                try:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube", version="v3",
                                        developerKey=self.credentials[current_key]['developer_key'],
                                        cache_discovery=False)
                                except UnknownApiNameOrVersion:
                                    service = read_dict_from_url(
                                        url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                    youtube = googleapiclient.discovery.build_from_document(
                                        service=service,
                                        developerKey=self.credentials[current_key]['developer_key'])
                            else:
                                raise
                    except HttpError as e:
                        if "403" in str(e):
                            # Quota exhausted or key rejected: rotate to the
                            # next developer key.
                            logging.info("Invalid {} developer key: {}".format(
                                current_key,
                                self.credentials[current_key]['developer_key']))
                            current_key = current_key + 1
                            if current_key >= len(self.credentials):
                                raise
                            else:
                                try:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube", version="v3",
                                        developerKey=self.credentials[current_key]['developer_key'],
                                        cache_discovery=False)
                                except UnknownApiNameOrVersion:
                                    service = read_dict_from_url(
                                        url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                    youtube = googleapiclient.discovery.build_from_document(
                                        service=service,
                                        developerKey=self.credentials[current_key]['developer_key'])
                        elif "503" in str(e):
                            logging.info("Service unavailable")
                            service_unavailable = service_unavailable + 1
                            if service_unavailable <= 10:
                                time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                            else:
                                raise
                        else:
                            raise
                if len(response.get('items', [])) == 0:
                    # Keep a stub record so the video is not queried again.
                    response['id'] = video_id['video_id']
                    response['retrieved_at'] = datetime.utcnow().strftime(
                        "%Y-%m-%d %H:%M:%S.%f")[:-3]
                    response['description'] = ("Video unavailable. It has probably "
                                               "been removed by the user.")
                    json_writer.write("{}\n".format(json.dumps(response)))
                else:
                    for item in response['items']:
                        item['snippet']['publishedAt'] = item['snippet'][
                            'publishedAt'].rstrip('Z').replace('T', ' ')
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        json_writer.write("{}\n".format(json.dumps(item)))
    logging.info("Compress file %s", output_json)
    compressed_file = compress(filename=output_json, delete_original=True)
    s3 = boto3.resource('s3')
    s3_filename = "youtube_video_snippet/creation_date={}/{}-{}.json.bz2".format(
        datetime.utcnow().strftime("%Y-%m-%d"), uuid.uuid4().hex, num_videos)
    logging.info("Upload file %s to bucket %s at %s",
                 compressed_file, self.s3_data, s3_filename)
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)
    logging.info("Recreate table for Youtube video snippets")
    athena.query_athena_and_wait(
        query_string="DROP TABLE IF EXISTS youtube_video_snippet")
    athena.query_athena_and_wait(
        query_string=CREATE_VIDEO_SNIPPET_JSON.format(s3_bucket=self.s3_data))
    athena.query_athena_and_wait(
        query_string="MSCK REPAIR TABLE youtube_video_snippet")
    logging.info("Concluded collecting video snippets")
def collect_related_video(self, region_code, creation_date=None):
    athena_db = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    trending_filename = Path(Path(__file__).parent, 'tmp', 'trending.csv')
    Path(trending_filename).parent.mkdir(parents=True, exist_ok=True)
    # Without an explicit creation_date, look back NUMBER_OF_DAYS from today;
    # otherwise backfill relative to the given date.
    if creation_date is None:
        query_string = TRENDING_VIDEOS.format(
            creation_date=(date.today() -
                           timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
            number_of_videos=self.NUMBER_OF_VIDEOS)
    else:
        query_string = TRENDING_VIDEOS.format(
            creation_date=(datetime.strptime(creation_date, '%Y-%m-%d') -
                           timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
            number_of_videos=self.NUMBER_OF_VIDEOS)
    trending_videos = athena_db.query_athena_and_download(
        query_string=query_string, filename=trending_filename)
    with open(trending_videos, newline='', encoding="utf8") as csv_reader:
        output_json = Path(Path(__file__).parent, 'tmp',
                           'youtube_related_video.json')
        Path(output_json).parent.mkdir(parents=True, exist_ok=True)
        with open(output_json, 'w') as json_writer:
            reader = csv.DictReader(csv_reader)
            current_key = 0
            try:
                youtube = googleapiclient.discovery.build(
                    serviceName="youtube", version="v3",
                    developerKey=self.credentials[current_key]['developer_key'],
                    cache_discovery=False)
            except UnknownApiNameOrVersion:
                service = read_dict_from_url(
                    url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                youtube = googleapiclient.discovery.build_from_document(
                    service=service,
                    developerKey=self.credentials[current_key]['developer_key'])
            num_videos = 0
            if creation_date is None:
                max_results = self.NUMBER_OF_RELATED_VIDEOS
                part = 'id'
            else:
                # Backfills need the snippet to filter by publication date and
                # over-fetch because some results will be discarded.
                part = 'snippet'
                max_results = min(self.NUMBER_OF_RELATED_VIDEOS * 3, 50)
            for trending_video in reader:
                service_unavailable = 0
                connection_reset_by_peer = 0
                no_response = True
                response = dict()
                while no_response:
                    try:
                        response = youtube.search().list(
                            part=part,
                            type='video',
                            regionCode=region_code,
                            relatedToVideoId=trending_video['id'],
                            maxResults=max_results).execute()
                        no_response = False
                    except SocketError as e:
                        if e.errno != errno.ECONNRESET:
                            logging.info("Other socket error!")
                            raise
                        else:
                            connection_reset_by_peer = connection_reset_by_peer + 1
                            logging.info("Connection reset by peer! {}".format(
                                connection_reset_by_peer))
                            if connection_reset_by_peer <= 10:
                                time.sleep(self.WAIT_WHEN_CONNECTION_RESET_BY_PEER)
                                try:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube", version="v3",
                                        developerKey=self.credentials[current_key]['developer_key'],
                                        cache_discovery=False)
                                except UnknownApiNameOrVersion:
                                    service = read_dict_from_url(
                                        url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                    youtube = googleapiclient.discovery.build_from_document(
                                        service=service,
                                        developerKey=self.credentials[current_key]['developer_key'])
                            else:
                                raise
                    except HttpError as e:
                        if "403" in str(e):
                            logging.info("Invalid {} developer key: {}".format(
                                current_key,
                                self.credentials[current_key]['developer_key']))
                            current_key = current_key + 1
                            if current_key >= len(self.credentials):
                                raise
                            else:
                                try:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube", version="v3",
                                        developerKey=self.credentials[current_key]['developer_key'],
                                        cache_discovery=False)
                                except UnknownApiNameOrVersion:
                                    service = read_dict_from_url(
                                        url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                    youtube = googleapiclient.discovery.build_from_document(
                                        service=service,
                                        developerKey=self.credentials[current_key]['developer_key'])
                        elif "Backend Error" in str(e) or "Not Found" in str(e):
                            # These errors are usually associated with requesting
                            # recommendations for a video that was deleted by the
                            # user. In that case, just move on.
                            logging.info("Backend/Not Found error. Video %s will be ignored",
                                         trending_video['id'])
                            no_response = False
                        elif "404" in str(e):
                            logging.info("Requested entity was not found. "
                                         "Video %s will be ignored",
                                         trending_video['id'])
                            no_response = False
                        elif "400" in str(e):
                            logging.info("Invalid argument. Video %s will be ignored",
                                         trending_video['id'])
                            no_response = False
                        elif "503" in str(e):
                            logging.info("Service unavailable")
                            service_unavailable = service_unavailable + 1
                            if service_unavailable <= 10:
                                time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                            else:
                                raise
                        else:
                            raise
                rank = 1
                for item in response.get('items', []):
                    item['relatedToVideoId'] = trending_video['id']
                    item['retrieved_at'] = datetime.utcnow().strftime(
                        "%Y-%m-%d %H:%M:%S.%f")[:-3]
                    item['rank'] = rank
                    if creation_date is None:
                        rank = rank + 1
                        num_videos = num_videos + 1
                        json_writer.write("{}\n".format(json.dumps(item)))
                    else:
                        # Keep only videos published before the backfill date,
                        # up to NUMBER_OF_RELATED_VIDEOS per seed video.
                        item['snippet']['publishedAt'] = item['snippet'][
                            'publishedAt'].rstrip('Z').replace('T', ' ')
                        if rank <= self.NUMBER_OF_RELATED_VIDEOS:
                            if item['snippet']['publishedAt'] <= creation_date + ' 00:00:00.000':
                                rank = rank + 1
                                num_videos = num_videos + 1
                                json_writer.write("{}\n".format(json.dumps(item)))
    logging.info("Compress file %s", output_json)
    compressed_file = compress(filename=output_json, delete_original=True)
    s3 = boto3.resource('s3')
    if creation_date is None:
        s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
            creation_date=datetime.utcnow().strftime("%Y-%m-%d"),
            num_videos=num_videos)
    else:
        s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
            creation_date=creation_date, num_videos=num_videos)
    logging.info("Upload file %s to bucket %s at %s",
                 compressed_file, self.s3_data, s3_filename)
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)
    logging.info("Recreate table for Youtube related video snippets")
    athena_db.query_athena_and_wait(
        query_string="DROP TABLE IF EXISTS youtube_related_video")
    athena_db.query_athena_and_wait(
        query_string=CREATE_VIDEO_RELATED_JSON.format(s3_bucket=self.s3_data))
    athena_db.query_athena_and_wait(
        query_string="MSCK REPAIR TABLE youtube_related_video")
    logging.info("Concluded collecting related video snippets")
def collect_channel_stats(self):
    logging.info("Start collecting Youtube channel stats")
    channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv')
    # Make sure the tmp directory exists, as the other collectors do.
    Path(channel_ids).parent.mkdir(parents=True, exist_ok=True)
    athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL,
                                     filename=channel_ids)
    channel_count = int(athena.query_athena_and_get_result(
        query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count'])
    logging.info("There are %d channels to be processed: download them",
                 channel_count)
    current_key = 0
    youtube = googleapiclient.discovery.build(
        serviceName="youtube", version="v3",
        developerKey=self.credentials[current_key]['developer_key'],
        cache_discovery=False)
    with open(channel_ids, newline='') as csv_reader:
        output_json = Path(Path(__file__).parent, 'tmp',
                           'youtube_channel_stats.json')
        with open(output_json, 'w') as json_writer:
            reader = csv.DictReader(csv_reader)
            num_channels = 0
            for channel_id in reader:
                if num_channels % self.LOGGING_INTERVAL == 0:
                    logging.info("%d out of %d channels processed",
                                 num_channels, channel_count)
                num_channels = num_channels + 1
                service_unavailable = 0
                no_response = True
                while no_response:
                    try:
                        response = youtube.channels().list(
                            part="statistics",
                            id=channel_id['channel_id']).execute()
                        no_response = False
                    except HttpError as e:
                        if "403" in str(e):
                            logging.info("Invalid {} developer key: {}".format(
                                current_key,
                                self.credentials[current_key]['developer_key']))
                            current_key = current_key + 1
                            if current_key >= len(self.credentials):
                                raise
                            else:
                                youtube = googleapiclient.discovery.build(
                                    serviceName="youtube", version="v3",
                                    developerKey=self.credentials[current_key]['developer_key'],
                                    cache_discovery=False)
                        elif "503" in str(e):
                            logging.info("Service unavailable")
                            service_unavailable = service_unavailable + 1
                            if service_unavailable <= 10:
                                time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                            else:
                                raise
                        else:
                            raise
                for item in response.get('items', []):
                    item['retrieved_at'] = datetime.utcnow().strftime(
                        "%Y-%m-%d %H:%M:%S.%f")[:-3]
                    json_writer.write("{}\n".format(json.dumps(item)))
    logging.info("Compress file %s", output_json)
    compressed_file = compress(filename=output_json, delete_original=True)
    s3 = boto3.resource('s3')
    s3_filename = "youtube_channel_stats/creation_date={}/{}.json.bz2".format(
        datetime.utcnow().strftime("%Y-%m-%d"), num_channels)
    logging.info("Upload file %s to bucket %s at %s",
                 compressed_file, self.s3_data, s3_filename)
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)
    logging.info("Recreate table for Youtube channel stats")
    athena.query_athena_and_wait(
        query_string="DROP TABLE IF EXISTS youtube_channel_stats")
    athena.query_athena_and_wait(
        query_string=CREATE_CHANNEL_STATS_JSON.format(s3_bucket=self.s3_data))
    athena.query_athena_and_wait(
        query_string="MSCK REPAIR TABLE youtube_channel_stats")
    logging.info("Concluded collecting channel stats")
def collect_video_snippets(self):
    logging.info("Start collecting video snippets")
    athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    query = SELECT_YOUTUBE_VIDEOS
    query_count = SELECT_COUNT_YOUTUBE_VIDEOS
    if athena.table_exists("youtube_video_snippet"):
        logging.info("Table youtube_video_snippet exists")
        query = query + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
        query_count = query_count + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
    logging.info("Download IDs for all Youtube videos that have not been processed yet")
    video_count = int(athena.query_athena_and_get_result(
        query_string=query_count)['video_count'])
    logging.info("There are %d links to be processed: download them", video_count)
    video_ids_csv = athena.query_athena_and_download(query_string=query,
                                                     filename="video_ids.csv")
    output_json = Path(Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    current_key = 0
    youtube = googleapiclient.discovery.build(
        serviceName="youtube", version="v3",
        developerKey=self.credentials[current_key]['developer_key'],
        cache_discovery=False)
    with open(video_ids_csv, newline='') as csv_reader:
        with open(output_json, 'w') as json_writer:
            reader = csv.DictReader(csv_reader)
            num_videos = 0
            for video_id in reader:
                if num_videos % self.LOGGING_INTERVAL == 0:
                    logging.info("%d out of %d videos processed",
                                 num_videos, video_count)
                num_videos = num_videos + 1
                service_unavailable = 0
                no_response = True
                while no_response:
                    try:
                        response = youtube.videos().list(
                            part="snippet", id=video_id['video_id']).execute()
                        no_response = False
                    except HttpError as e:
                        if "403" in str(e):
                            logging.info("Invalid {} developer key: {}".format(
                                current_key,
                                self.credentials[current_key]['developer_key']))
                            current_key = current_key + 1
                            if current_key >= len(self.credentials):
                                raise
                            else:
                                youtube = googleapiclient.discovery.build(
                                    serviceName="youtube", version="v3",
                                    developerKey=self.credentials[current_key]['developer_key'],
                                    cache_discovery=False)
                        elif "503" in str(e):
                            logging.info("Service unavailable")
                            service_unavailable = service_unavailable + 1
                            if service_unavailable <= 10:
                                time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                            else:
                                raise
                        else:
                            raise
                if len(response.get('items', [])) == 0:
                    response['id'] = video_id['video_id']
                    response['retrieved_at'] = datetime.utcnow().strftime(
                        "%Y-%m-%d %H:%M:%S.%f")[:-3]
                    response['description'] = ("Video unavailable. It has probably "
                                               "been removed by the user.")
                    json_writer.write("{}\n".format(json.dumps(response)))
                else:
                    for item in response['items']:
                        item['snippet']['publishedAt'] = item['snippet'][
                            'publishedAt'].rstrip('Z').replace('T', ' ')
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        json_writer.write("{}\n".format(json.dumps(item)))
    logging.info("Compress file %s", output_json)
    compressed_file = compress(filename=output_json, delete_original=True)
    s3 = boto3.resource('s3')
    s3_filename = "youtube_video_snippet/{}-{}.json.bz2".format(
        datetime.utcnow().strftime("%Y-%m-%d"), num_videos)
    logging.info("Upload file %s to bucket %s at %s",
                 compressed_file, self.s3_data, s3_filename)
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)
    logging.info("Concluded collecting video snippets")
    athena.query_athena_and_wait(
        query_string=CREATE_VIDEO_SNIPPET_JSON.format(s3_bucket=self.s3_data))
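# The fragment below is a one-off backfill script: current_date and athena_db
# come from its surrounding context, which is elided here. It materializes the
# same tweet_user_url/creation_date=... partitions on S3 that expand_urls
# (further down) maintains incrementally.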
query = """ select twitter_stream.id_str as tweet_id, twitter_stream.user.id_str as user_id, url.expanded_url as url from internet_scholar.twitter_stream_raw as twitter_stream, unnest(entities.urls) as t(url) where creation_date = '{creation_date}' and url.display_url not like 'twitter.com/%' order by tweet_id, user_id, url; """ while current_date <= date(2019, 11, 27): print(str(current_date)) tweet_user_url = athena_db.query_athena_and_download( query_string=query.format(creation_date=str(current_date)), filename=str(current_date) + '.csv') compressed_file = compress(filename=tweet_user_url) s3 = boto3.resource('s3') s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format( creation_date=str(current_date), code=uuid.uuid4().hex) s3.Bucket('internet-scholar').upload_file(str(compressed_file), s3_filename) current_date = current_date + timedelta(days=1)
def import_data(related_date, end_related_date, graph_date_difference, timespan):
    database = sqlite3.connect('./youtube_recommendations.sqlite')
    sqlite_aws = SqliteAWS(database=database,
                           s3_admin='internet-scholar-admin',
                           s3_data='internet-scholar',
                           athena_db='internet_scholar')
    logging.info('Retrieve recommendations...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='recommendation_aux',
        query=RECOMMENDATION.format(begin_date=str(related_date),
                                    end_date=str(end_related_date)))
    logging.info('Add primary key to recommendation table...')
    database.execute(CREATE_TABLE_RECOMMENDATION)
    database.execute(INSERT_TABLE_RECOMMENDATION)
    database.execute('DROP TABLE recommendation_aux')
    logging.info('Update categories and null values...')
    database.execute(UPDATE_CATEGORY_SEED)
    database.execute(UPDATE_CATEGORY_RECOMMENDED)
    database.execute(UPDATE_NULL_SEED)
    database.execute(UPDATE_NULL_RECOMMENDED)
    logging.info('Retrieve Twitter users and YouTube channel data...')
    initial_date = related_date + timedelta(days=graph_date_difference) - \
        timedelta(days=timespan - 1)
    final_date = end_related_date + timedelta(days=graph_date_difference)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='twitter_user_channel',
        query=TWITTER_USER_CHANNEL.format(initial_date=str(initial_date),
                                          final_date=str(final_date)))
    logging.info('Calculate number of common Twitter users per channel...')
    database.execute(CREATE_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS)
    current_date = related_date
    while current_date <= end_related_date:
        logging.info(str(current_date))
        initial_date = current_date + timedelta(days=graph_date_difference) - \
            timedelta(days=timespan - 1)
        final_date = current_date + timedelta(days=graph_date_difference)
        database.execute(INSERT_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS.format(
            initial_date=initial_date, final_date=final_date))
        current_date = current_date + timedelta(days=1)
    logging.info('Update aggregate on SQLite table 1...')
    database.execute("ALTER TABLE recommendation ADD COLUMN seed_user_count INT")
    database.execute(UPDATE_SEED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 2...')
    database.execute("ALTER TABLE recommendation ADD COLUMN recommended_user_count INT")
    database.execute(UPDATE_RECOMMENDED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 3...')
    database.execute("ALTER TABLE recommendation ADD COLUMN common_user_count INT")
    database.execute(UPDATE_COMMON_USER_COUNT)
    logging.info('Retrieve info about political leaning...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_political_leaning',
        query=SELECT_POLITICAL_LEANING.format(initial_date=str(related_date),
                                              final_date=str(end_related_date)))
    logging.info('Update political leaning info on SQLite 1...')
    database.execute("ALTER TABLE recommendation ADD COLUMN seed_political_leaning TEXT")
    database.execute(UPDATE_SEED_POLITICAL_LEANING)
    logging.info('Update political leaning info on SQLite 2...')
    database.execute("ALTER TABLE recommendation ADD COLUMN recommended_political_leaning TEXT")
    database.execute(UPDATE_RECOMMENDED_POLITICAL_LEANING)
    logging.info('Retrieve data on channel stats...')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    athena_db.query_athena_and_wait(
        query_string=CREATE_VIEW_ENHANCED_CHANNEL_STATS)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_stats',
        query=SELECT_ENHANCED_STATS.format(initial_date=str(related_date),
                                           final_date=str(end_related_date)))
    logging.info('Add primary key to channel stats...')
    database.execute(CREATE_CHANNEL_STATS_WITH_PRIMARY_KEY)
    database.execute(INSERT_CHANNEL_STATS_WITH_PRIMARY_KEY)
    add_stat_to_sqlite(database, field='view_count')
    add_stat_to_sqlite(database, field='cumulative_view_count')
    add_stat_to_sqlite(database, field='subscriber_count')
    add_stat_to_sqlite(database, field='cumulative_subscriber_count')
    add_stat_to_sqlite(database, field='video_count')
    add_stat_to_sqlite(database, field='cumulative_video_count')
    add_stat_to_sqlite(database, field='comment_count')
    add_stat_to_sqlite(database, field='cumulative_comment_count')
    # Drop the staging tables and shrink the file before shipping it.
    database.execute('DROP TABLE channel_political_leaning')
    database.execute('DROP TABLE channel_stats')
    database.execute('DROP TABLE channel_stats_with_primary_key')
    database.execute('DROP TABLE twitter_user_channel')
    database.execute('DROP TABLE youtube_channel_common_twitter_users')
    database.commit()
    database.execute('VACUUM')
    database.close()
    new_filename = compress('./youtube_recommendations.sqlite')
    s3_filename = "youtube_data_export_r/{timestamp}.sqlite.bz2".format(
        timestamp=datetime.utcnow().strftime("%Y%m%d-%H%M%S"))
    s3 = boto3.resource('s3')
    s3.Bucket('internet-scholar').upload_file(str(new_filename), s3_filename)
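# add_stat_to_sqlite is defined elsewhere in the project. A plausible sketch
# of its shape, inferred from how it is called above and from the
# ALTER-then-UPDATE pattern used for the other derived columns. The column
# names seed_channel_id, recommended_channel_id, and retrieved_date are
# assumptions, not confirmed by this file.
def add_stat_to_sqlite_sketch(database, field):
    for side in ('seed', 'recommended'):
        database.execute(
            "ALTER TABLE recommendation ADD COLUMN {side}_{field} INT".format(
                side=side, field=field))
        database.execute(
            "UPDATE recommendation SET {side}_{field} = "
            "(SELECT {field} FROM channel_stats_with_primary_key stats "
            " WHERE stats.channel_id = recommendation.{side}_channel_id "
            " AND stats.retrieved_date = recommendation.retrieved_date)".format(
                side=side, field=field))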
def expand_urls(self, creation_date=None):
    logging.info("begin: expand URLs")
    athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
    yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
    if creation_date is None:
        creation_date = yesterday
    logging.info("Expand URLs that were tweeted on {creation_date}".format(
        creation_date=creation_date))
    query_tweet_user_url = self.__TWEET_USER_URL.format(creation_date=creation_date)
    query = self.__UNVALIDATED_URLS.format(creation_date=creation_date)
    query_count = self.__COUNT_UNVALIDATED_URLS.format(creation_date=creation_date)
    if athena.table_exists("validated_url"):
        logging.info("Table validated_url exists")
        # Skip URLs that were already validated on previous runs.
        query = query + " and url not in (select validated_url.url from validated_url)"
        query_count = query_count + " and url not in (select validated_url.url from validated_url)"
    logging.info('Update table tweet_user_url')
    tweet_user_url = athena.query_athena_and_download(
        query_string=query_tweet_user_url,
        filename=creation_date + '.csv')
    compressed_file = compress(filename=tweet_user_url)
    s3 = boto3.resource('s3')
    s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format(
        creation_date=creation_date, code=uuid.uuid4().hex)
    logging.info('Upload data file that will comprise tweet_user_url')
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)
    logging.info('Update table tweet_user_url on Athena')
    logging.info("Create Athena table tweet_user_url if it does not exist already")
    athena.query_athena_and_wait(
        query_string=self.__CREATE_TABLE_TWEET_USER_URL.format(s3_data=self.s3_data))
    athena.query_athena_and_wait(query_string="MSCK REPAIR TABLE tweet_user_url")
    link_count = int(athena.query_athena_and_get_result(
        query_string=query_count)['link_count'])
    logging.info("There are %d links to be processed: download them", link_count)
    unvalidated_urls = athena.query_athena_and_download(
        query_string=query, filename="unvalidated_urls.csv")
    with open(unvalidated_urls, newline='') as csv_reader:
        validated_urls = Path(Path(__file__).parent, 'tmp', 'validated_urls.csv')
        Path(validated_urls).parent.mkdir(parents=True, exist_ok=True)
        logging.info("Create file %s for validated URLs", validated_urls)
        with open(str(validated_urls), 'w') as csv_writer:
            reader = csv.DictReader(csv_reader)
            writer = csv.DictWriter(csv_writer,
                                    fieldnames=['url', 'validated_url',
                                                'status_code', 'content_type',
                                                'content_length', 'created_at'],
                                    dialect='unix')
            url_expander = URLExpander()
            num_links = 0
            for url in reader:
                if num_links % self.LOGGING_INTERVAL == 0:
                    logging.info("%d out of %d links processed",
                                 num_links, link_count)
                num_links = num_links + 1
                for expanded_url in url_expander.expand_url(url['url']):
                    writer.writerow(expanded_url)
            logging.info("All links processed")
    logging.info("Compress file %s", validated_urls)
    compressed_file = compress(filename=validated_urls, delete_original=True)
    if creation_date == yesterday:
        filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
            time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime()), link_count)
    else:
        filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
            creation_date + '-23-59-59', link_count)
    logging.info("Upload file %s to bucket %s at %s",
                 compressed_file, self.s3_data, filename_s3)
    s3.Bucket(self.s3_data).upload_file(str(compressed_file), filename_s3)
    logging.info("Delete previous validated_url data: will be generated again")
    s3.Bucket(self.s3_data).objects.filter(Prefix="validated_url/").delete()
    logging.info("Create Athena table validated_url_raw if it does not exist already")
    athena.query_athena_and_wait(
        query_string=self.__CREATE_TABLE_VALIDATED_URL_RAW.format(s3_data=self.s3_data))
    logging.info("Drop Athena table validated_url")
    athena.query_athena_and_wait(query_string="drop table if exists validated_url")
    logging.info("Create Athena table validated_url through CTAS")
    athena.query_athena_and_wait(
        query_string=self.__CREATE_TABLE_VALIDATED_URL.format(s3_data=self.s3_data))
    logging.info("end: expand URLs")
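# URLExpander is imported from the project's utility module. A minimal
# stand-in, assuming it follows redirects and yields one dict per input URL
# with the CSV fieldnames used above; the requests-based implementation and
# the class name URLExpanderSketch are illustrative assumptions.
class URLExpanderSketch:
    def expand_url(self, url):
        import requests
        try:
            response = requests.head(url, allow_redirects=True, timeout=10)
            yield {'url': url,
                   'validated_url': response.url,
                   'status_code': response.status_code,
                   'content_type': response.headers.get('content-type', ''),
                   'content_length': response.headers.get('content-length', ''),
                   'created_at': datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]}
        except requests.RequestException:
            # Record unreachable URLs too, so they are not retried forever.
            yield {'url': url,
                   'validated_url': url,
                   'status_code': '',
                   'content_type': '',
                   'content_length': '',
                   'created_at': datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]}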