def analysis():
    """Print summary statistics for the labeled channel data.

    Merges the political and non-political channel files, then reports
    mean/median/max/min subscription counts — first over all channels,
    then over channels with at least one subscription — and finally the
    number of channels that have a non-empty 'cross' entry.
    """
    merged = read_channels('../output/final_data/P_channelData.json')
    merged.update(read_channels('../output/final_data/NP_channelData.json'))

    counts_all = [len(c['subscriptions']) for c in merged.values()]
    counts_nonzero = [n for n in counts_all if n != 0]

    # Same four statistics, with and without the zero-subscription channels.
    for counts in (counts_all, counts_nonzero):
        print(np.mean(counts))
        print(np.median(counts))
        print(max(counts))
        print(min(counts))

    # Channels that cross-reference at least one other channel.
    print(len([c for c in merged.values() if len(c['cross'])]))
def create_text_df(mode='captions', labeled=True, binary=True):
    """Load channel data and build a text DataFrame from it.

    Args:
        mode: text source — 'captions', 'comments', or anything else
            for video snippets.
        labeled: load the labeled (P/NP) files instead of the unlabeled
            (U) ones.
        binary: use the binary-label files; otherwise the multilabel ones.

    Returns:
        pandas.DataFrame with one row per extracted text item.
    """
    if labeled and binary:
        # The binary labeled set is political + non-political merged.
        data = read_channels("../output/final_data/P_channelData.json")
        data.update(
            read_channels("../output/final_data/NP_channelData.json"))
    elif labeled:
        data = read_channels(
            "../output/final_data/P_channelData_multilabel.json")
    elif binary:
        data = read_channels("../output/final_data/U_channelData.json")
    else:
        data = read_channels(
            "../output/final_data/U_channelData_multilabel.json")

    # Dispatch on the requested text source; snippets is the fallback.
    builders = {
        'captions': create_caption_data_dict,
        'comments': create_comment_data_dict,
    }
    build = builders.get(mode, create_snippet_data_dict)
    rows = build(data, binary=binary)

    # write_df(data, content=mode, binary=binary, labeled=labeled)
    return pd.DataFrame(rows)
def create_merged_dataframe(binary: bool = True):
    """Build a cross-comment map between known channels and dump it to JSON.

    For every channel, finds the other *known* channels whose authors left
    comments on its videos (self-comments excluded) and writes one record
    per channel with those cross-commenting channel ids plus basic metadata.

    Fix: the signature was `def create_merged_dataframe(binary: True):` —
    `True` was a bare annotation and `binary` had no default. It is now a
    proper `bool` annotation with default True (backward-compatible).

    Args:
        binary: if True, use the binary-label (P + NP + U) files and write
            cross_comments.json; otherwise use the multilabel files and
            write cross_comments_multilabel.json.
    """
    if binary:
        channels = read_channels('../output/final_data/P_channelData.json')
        channels_NP = read_channels('../output/final_data/NP_channelData.json')
        channels.update(channels_NP)
        channels_unlabeled = read_channels(
            '../output/final_data/U_channelData.json')
    else:
        channels = read_channels(
            '../output/final_data/P_channelData_multilabel.json')
        channels_unlabeled = read_channels(
            '../output/final_data/U_channelData_multilabel.json')
    channels.update(channels_unlabeled)

    all_channels = list(channels.keys())
    # Set for O(1) membership tests instead of O(n) scans of the id list
    # (the original tested `in all_channels` once per comment).
    known_ids = set(all_channels)

    cross_comments = {}
    for id, channel in tqdm(channels.items()):
        # Per-author comment counts, keyed over all known channels so the
        # resulting list order matches the channel ordering (as before).
        authors = dict.fromkeys(all_channels, 0)
        for video in channel['comments'].values():
            if video is not None:
                for comment in video:
                    if comment['authorChannelId'] is not None:
                        author = comment['authorChannelId']['value']
                        # Count only cross-comments from *other* known channels.
                        if author in known_ids and author != channel['ChannelId']:
                            authors[author] += 1
        cross_comments[id] = {
            'cross_comments':
            [author for author, count in authors.items() if count > 0],
            'SoftTags': channels[id]['SoftTags'],
            'ChannelId': channels[id]['ChannelId'],
            'ChannelTitle': channels[id]['ChannelTitle'],
        }
        # channels[id]['comment_mentions'] = [{channel: mentions[channel]} for channel in mentions.keys() if mentions[channel]>0]

    print('dumping')
    if binary:
        output_path = "../output/final_data/cross_comments.json"
    else:
        output_path = "../output/final_data/cross_comments_multilabel.json"
    with open(output_path, "w") as f:
        json.dump(cross_comments, f, indent=4)
    print('ok')
def main():
    """Fetch subscriptions and related channels for every unlabeled channel.

    Reads the comma-separated API_KEYS environment variable, queries the
    API for each channel in U_channelDataWithComments.json, and writes the
    enriched data to U_channelData.json.
    """
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    load_dotenv()

    scraper = RelationAPIScraper(os.getenv('API_KEYS').split(','), 0)

    channelData = read_channels(
        '../output/unlabeled_data/U_channelDataWithComments.json')
    for channel_id in tqdm(channelData.keys()):
        entry = channelData[channel_id]
        entry['subscriptions'] = scraper.gather_info(channel_id,
                                                     'subscriptions')
        entry['related_channels'] = scraper.gather_info(channel_id,
                                                        'affiliations')

    with open('../output/unlabeled_data/U_channelData.json', "w") as f:
        json.dump(channelData, f, indent=4)
def main():
    """Download comments for the top-3 videos of every unlabeled channel.

    Reads the comma-separated API_KEYS environment variable, fetches the
    comments of each channel's top videos, and writes the result to
    U_channelDataWithComments.json.
    """
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    load_dotenv()

    scraper = CommentAPIScraper(os.getenv('API_KEYS').split(','), 0)

    channelData = read_channels(
        '../output/unlabeled_data/U_channelDataWithCaptions.json')
    for channel_id in tqdm(channelData.keys()):
        # One comment list per video id, keyed by the video id.
        channelData[channel_id]['comments'] = {
            video['VideoId']: scraper.gather_info(video['VideoId'])
            for video in channelData[channel_id]['top3videos']
        }

    with open("../output/unlabeled_data/U_channelDataWithComments.json",
              "w") as f:
        json.dump(channelData, f, indent=4)
def plot_channels(path_P,
                  path_NP,
                  path_U,
                  merge=False,
                  content='related_channels',
                  only_known=False,
                  directed=False,
                  output='network_plot.html',
                  line_width=1.0):
    """Render an interactive plotly network of channel-to-channel links.

    Builds an edge list from each channel's `content` field, lays the graph
    out with a networkx spring layout, and writes an HTML figure to `output`.

    Args:
        path_P: JSON file with political channel data.
        path_NP: JSON file with non-political channel data (used if merge=True).
        path_U: JSON file with unlabeled channel data (loaded but currently
            unused — its update() call is commented out below).
        merge: also load the NP file on top of the P data.
        content: channel field holding outgoing connections.
        only_known: drop edges whose target is not itself a loaded channel.
        directed: draw per-edge arrows and color by fine-grained 'labels'.
        output: HTML output file name.
        line_width: width of the edge lines.

    NOTE(review): `related_channel_info` is only assigned when
    content == 'related_channels'; for any other `content`, an unknown
    target node in the loop below would raise NameError — presumably callers
    pass only_known=True in that case. Confirm.
    """
    channels = read_channels(path_P)
    if merge:
        channels_NP = read_channels(path_NP)
        channels.update(channels_NP)
        channels_U = read_channels(path_U)
        # channels.update(channels_U)
    # Build the raw edge list: one row per (source channel -> target) link.
    network_data = pd.DataFrame({'source': [], 'target': []})
    for channel, value in tqdm(channels.items()):
        for affiliate in value[content]:
            tmp = pd.DataFrame({'source': [channel], 'target': [affiliate]})
            network_data = network_data.append(tmp)
    network_data['index'] = range(0, len(network_data))
    network_data.set_index('index', inplace=True)
    # get basic information about unknown channels, but only if in
    # related_channels mode, otherwise there are too many
    if content == 'related_channels':
        # get channel info via API
        # unknown_channels = []
        # for index, row in network_data.iterrows():
        #     if row['target'] not in channels.keys():
        #         unknown_channels.append(row['target'])
        # this method uses api quota so it should be called rarely, instead,
        # the file should be read from disk
        # related_channel_info = get_channel_snippets(unknown_channels)
        # or use previous execution of that function, that was written to a file
        related_channel_info = read_channels('related_channels.json')
    if only_known:
        # Mark targets that are not loaded channels, then drop those edges.
        network_data['target'] = network_data['target'].apply(
            lambda x: 'unknown' if x not in channels.keys() else x)
        network_data = network_data[network_data['target'] != 'unknown']
        # for index, row in tqdm(network_data.iterrows()):
        #     if row['target'] not in channels.keys():
        #         network_data.drop(index, inplace=True)
    # Directed graph for in-degrees/arrow drawing; undirected for the layout.
    di_G = nx.from_pandas_edgelist(network_data, create_using=nx.DiGraph())
    G = nx.from_pandas_edgelist(network_data)
    coordinates = spring_layout(
        G, seed=42, iterations=50)  # , iterations=1 ,k=1/sqrt(n) = 0.03
    # coordinates = kamada_kawai_layout(G)  # , iterations=1 ,k=1/sqrt(n)
    # coordinates = spectral_layout(G)
    # edge_x/edge_y collect polyline segments (None separates edges);
    # x_0/y_0 -> x_1/y_1 hold per-edge start/end points for the arrows.
    edge_x = []
    edge_y = []
    x_0 = []
    y_0 = []
    x_1 = []
    y_1 = []
    for edge in di_G.edges():
        x0, y0 = coordinates[edge[0]]
        x1, y1 = coordinates[edge[1]]
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)
        x_0.append(x0)
        y_0.append(y0)
        x_1.append(x1)
        y_1.append(y1)
    # One row per node with display metadata; nodes not in `channels` fall
    # back to related_channel_info or 'Unknown' placeholders.
    node_df = pd.DataFrame({
        'ChannelId': [],
        'Title': [],
        'Subscribers': [],
        'ChannelViews': [],
        '#related_channels': [],
        '#subscriptions': [],
        'in_degree': [],
        'x': [],
        'y': [],
        'label': [],
        'labels': [],
        'known': []
    })
    for node in G.nodes():
        x, y = coordinates[node]
        tmp = pd.DataFrame({
            'ChannelId': [node],
            'Title': [channels[node]['ChannelTitle']]
            if node in channels.keys() else
            [related_channel_info[node]['ChannelTitle']]
            if node in related_channel_info.keys() else ['Unknown'],
            'Subscribers': [channels[node]['Subs']]
            if node in channels.keys() else
            [related_channel_info[node]['Subs']]
            if node in related_channel_info.keys() else ['Unknown'],
            'ChannelViews': [channels[node]['ChannelViews']]
            if node in channels.keys() else
            [related_channel_info[node]['ChannelViews']]
            if node in related_channel_info.keys() else ['Unknown'],
            '#related_channels': [len(channels[node]['related_channels'])]
            if node in channels.keys() else
            [related_channel_info[node]['related_channels']]
            if node in related_channel_info.keys() else [0],
            '#subscriptions': [len(channels[node]['subscriptions'])]
            if node in channels.keys() else [0],
            'in_degree': di_G.in_degree[node],
            'x': [x],
            'y': [y],
            # 'label': [str(channels[node]['SoftTags'][0])] if node in channels.keys() else ['Unknown'],
            # Coarse label: Unknown / Non-Political / Unlabeled / Political.
            'label': [
                'Unknown' if node not in channels.keys() else
                'Non-Political' if channels[node]['SoftTags'][0] ==
                'Non-Political' else 'Unlabeled' if channels[node]
                ['SoftTags'][0] == 'UNLABELED' else 'Political'
            ],
            'labels': [determine_labels(channels[node]['SoftTags'])]
            if node in channels.keys() else ['Unknown'],
            'known': str(node in channels.keys())
        })
        node_df = node_df.append(tmp)
    if directed:
        # Directed view: node scatter plus one arrow annotation per edge.
        fig = px.scatter(
            node_df,
            x="x",
            y="y",
            color="labels",  #
            symbol='known',
            hover_data=['Title', 'ChannelId', 'labels', '#subscriptions'],
            size='in_degree',
            color_discrete_sequence=[
                "lightcoral", "red", "green", "lime", "orange", "cyan",
                "mediumslateblue", "blue", "magenta", "honeydew", "gray",
                "goldenrod", "teal", "yellow", "CornflowerBlue", "LightPink"
            ]).update_layout(
                dict(annotations=[
                    dict(ax=x_0[i],
                         ay=y_0[i],
                         axref='x',
                         ayref='y',
                         x=x_1[i],
                         y=y_1[i],
                         xref='x',
                         yref='y',
                         showarrow=True,
                         arrowhead=2,
                         arrowsize=1,
                         arrowwidth=1) for i in range(0, len(x_0))
                ]))
    else:
        fig = px.scatter(
            node_df,
            x="x",
            y="y",
            color="label",
            hover_data=['Title', 'ChannelId', 'labels'],
            # size='in_degree',
            color_discrete_sequence=[
                "red", "green", "lightcoral", "lime", "black", "cyan",
                "#feafda", "blue", "magenta", "honeydew", "gray", "goldenrod",
                "darkmagenta", "yellow", "CornflowerBlue", "LightPink"
            ])
    # Edge lines go into their own trace; it is added first so the node
    # markers are drawn on top of it.
    fig2 = go.Figure(
        data=go.Scatter(x=edge_x,
                        y=edge_y,
                        line=dict(width=line_width, color='#888'),
                        hoverinfo='skip',
                        mode='lines',
                        showlegend=False))
    figTotal = go.Figure()
    figTotal.add_trace(fig2['data'][0])
    for trace in fig['data']:
        figTotal.add_trace(trace)
    # figTotal.add_trace(fig['data'][1])
    # fig.add_scatter(x=edge_x, y=edge_y,
    #                 line=dict(width=line_width, color='#888'),
    #                 hoverinfo='skip',
    #                 mode='lines',
    #                 showlegend=False
    #                 )
    figTotal.update_traces(marker=dict(size=10,
                                       line=dict(width=1,
                                                 color='DarkSlateGrey')),
                           selector=dict(mode='markers'))
    figTotal.update_xaxes(showgrid=False, zeroline=False, visible=False)
    figTotal.update_yaxes(showgrid=False, zeroline=False, visible=False)
    figTotal.show()
    figTotal.write_html(output)
import csv
import json

from utils.data_IO import read_channels

if __name__ == '__main__':
    # Channels present in both the labeled political set and the unlabeled
    # set are duplicates: print their titles, record their ids in a CSV,
    # and remove them from the unlabeled data file.
    p_data = read_channels("../output/P_channelData.json")
    # np_data = read_channels("../output/NP_channelData.json")
    u_data = read_channels("../output/final_data/U_channelData.json")

    duplicate_channels = []
    for channel_id, info in u_data.items():
        if channel_id in p_data:
            print(info['ChannelTitle'])
            duplicate_channels.append(channel_id)

    with open('duplicate_data.csv', 'w') as result_file:
        csv.writer(result_file, dialect='excel').writerow(duplicate_channels)

    for channel_id in duplicate_channels:
        u_data.pop(channel_id, None)

    with open("../output/final_data/U_channelData.json", "w") as f:
        json.dump(u_data, f, indent=4)
def get_connection_dataframe(content='related_channels',
                             only_known=True,
                             labeled=True,
                             binary=True):
    """Build a one-hot/count-encoded connection DataFrame for the channels.

    Loads channel data (or the precomputed cross-comment files), collects
    each channel's outgoing connections from the `content` field, and
    expands them into a wide DataFrame with one column per connected
    channel (counts, 0 where absent) plus 'label', 'id' and 'title'
    columns. The selected slice is also persisted via write_df.

    Args:
        content: channel field holding connections, e.g. 'related_channels',
            'subscriptions' or 'cross_comments'.
        only_known: keep only connections that point to loaded channels.
        labeled: return the labeled head of the data; otherwise the
            unlabeled tail.
        binary: use the binary-label files; otherwise the multilabel ones.

    Returns:
        pandas.DataFrame slice of the encoded connections.
    """
    start_time = time.time()
    if content == 'cross_comments':
        if binary:
            channels = read_channels(
                '../output/final_data/cross_comments.json')
        else:
            channels = read_channels(
                '../output/final_data/cross_comments_multilabel.json')
    else:
        if binary:
            channels = read_channels('../output/final_data/P_channelData.json')
            channels_NP = read_channels(
                '../output/final_data/NP_channelData.json')
            channels.update(channels_NP)
            channels_unlabeled = read_channels(
                '../output/final_data/U_channelData.json')
        else:
            channels = read_channels(
                '../output/final_data/P_channelData_multilabel.json')
            channels_unlabeled = read_channels(
                "../output/final_data/U_channelData_multilabel.json")
        # Unlabeled channels are appended after the labeled ones, so the
        # labeled rows occupy the first `labeled_index` positions below.
        channels.update(channels_unlabeled)
    labeled_index = len([
        channel for channel in channels.values()
        if channel['SoftTags'][0] != 'UNLABELED'
    ])
    all_channels = list(channels.keys())
    connections_df = pd.DataFrame({
        'ChannelId': [],
        'ChannelTitle': [],
        'label': [],
        'connections': []
    })
    # subs = []
    for key, value in channels.items():
        if only_known:
            tmp = pd.DataFrame({
                'ChannelId': [key],
                'ChannelTitle': [value['ChannelTitle']],
                'label':
                [determine_label(labels=value['SoftTags'], binary=binary)],
                'connections': [[
                    affiliate for affiliate in value[content]
                    if affiliate in all_channels
                ]]
            })
        else:
            tmp = pd.DataFrame({
                'ChannelId': [key],
                'ChannelTitle': [value['ChannelTitle']],
                'label':
                [determine_label(labels=value['SoftTags'], binary=binary)],
                'connections': [value[content]]
            })
        # Channels without connections get the placeholder 'None' so the
        # Counter step still yields a row; that column is dropped below.
        if len(tmp['connections'][0]) == 0:
            tmp.at[0, 'connections'] = ['None']
        # else:
        #     subs.append(len(tmp['connections'][0]))
        connections_df = connections_df.append(tmp)
    # print('Mean relations: %f | Median Relations: %f' % (np.mean(subs), np.median(subs)))
    # Counter turns each connection list into {target: count};
    # from_records then expands that into one column per target channel.
    tmp = connections_df['connections'].apply(Counter)
    tmp = tmp.reset_index().drop(columns='index')
    encoded_connections_df = pd.DataFrame.from_records(
        tmp['connections']).fillna(value=0).drop(columns='None')
    encoded_connections_df['label'] = list(connections_df['label'])
    encoded_connections_df['id'] = list(connections_df['ChannelId'])
    encoded_connections_df['title'] = list(connections_df['ChannelTitle'])
    # encoded_connections_df.reset_index().drop(columns='index')
    end_time = time.time() - start_time
    print(f"finishing: {content} after {end_time} second")
    if labeled:
        if binary:
            # We want political and non political data in the dataset
            # return all labeled channels
            write_df(encoded_connections_df.iloc[:labeled_index],
                     content=content,
                     binary=binary,
                     labeled=labeled)
            return encoded_connections_df.iloc[:labeled_index]
        elif content == 'cross_comments' and not labeled:
            # NOTE(review): this branch is unreachable — it sits inside the
            # `if labeled:` block, so `not labeled` is always False here.
            # Confirm whether it was meant to live in the outer `else`.
            # We only want the political data
            # return all labeled political channels
            # unfortunately there is no nice way to index non-scalar values
            for index, row in encoded_connections_df.iterrows():
                if row['label'] == ['UNLABELED']:
                    write_df(encoded_connections_df.iloc[:index],
                             content=content,
                             binary=binary,
                             labeled=labeled)
                    return encoded_connections_df.iloc[:index]
        else:
            # in this case the whole dataset only contains political and
            # unlabeled channels
            write_df(encoded_connections_df.iloc[:labeled_index],
                     content=content,
                     binary=binary,
                     labeled=labeled)
            return encoded_connections_df.iloc[:labeled_index]
    else:
        write_df(
            encoded_connections_df.iloc[labeled_index:].reset_index().drop(
                columns='index'),
            content=content,
            binary=binary,
            labeled=labeled)
        return encoded_connections_df.iloc[labeled_index:].reset_index().drop(
            columns='index')
subs = [int(channel['Subs']) for channel in NP_data.values()] subs.append(1) mean_subs = np.mean(subs) median = np.median(subs) std = np.std(subs) print(mean_subs) print(median) print(std) print(np.max(subs)) print(np.min(subs)) # x = 10 ** np.random.uniform(size=1000) plt.hist(subs, 50) # plt.hist(x, bins=10 ** np.linspace(0, 1, 10)) min_ylim, max_ylim = plt.ylim() plt.axvline(mean_subs, color='k', linestyle='dashed', linewidth=1) plt.text(mean_subs * 1.1, max_ylim * 0.8, 'Mean: {:.2f}'.format(mean_subs)) plt.axvline(median, color='g', linestyle='dashed', linewidth=1) plt.text(median * 0.58, max_ylim * 0.9, 'Median: {:}'.format(int(median))) plt.xscale('log') # plt.xlim(left, right) plt.xlabel('Number of Subscribers', size=13) plt.ylabel('Number of Channels', size=13) plt.show() if __name__ == "__main__": P_data = read_channels('../output/final_data/P_channelData.json') NP_data = read_channels('../output/final_data/NP_channelData.json') plotSubscriptions(P_data, NP_data)
developerKey=self.api_keys[self.api_number + 1]) self.api_number += 1 # try again return self.gather_info(videoId) else: raise http_error if __name__ == "__main__": os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1" load_dotenv() api_keys = os.getenv('API_KEYS').split(',') api_number = 0 api_scraper = VideoSnippetAPIScraper(api_keys, api_number) channelData = read_channels( '../output/unlabeled_data/U_channelDataWithScrapedTopVideos.json') for channel in tqdm(channelData.keys()): videoIDs = [ video for video in channelData[channel]['top3videos'] if not None ] video_snippets = [] for videoId in videoIDs: video_snippets.append(api_scraper.gather_info(videoId)) channelData[channel]['top3videos'] = video_snippets with open("../output/unlabeled_data/U_channelDataWithTopVideos.json", "w") as f: json.dump(channelData, f, indent=4)
import csv
import json

from tqdm import tqdm

from utils.data_IO import read_channels


def remove_NP_data(data):
    """Drop channels predicted as non-political from *data*, in place.

    Reads unlabeled_data_predictions.csv (';'-separated text with a header
    row, '|' quote char) and removes every channel whose id (column 0) was
    predicted '__label__non-political' (column 9).

    Returns:
        The same dict, for convenience.
    """
    with open("unlabeled_data_predictions.csv", 'r') as f:
        reader = csv.reader(f, delimiter=';', quotechar='|')
        next(reader, None)  # skip the header row
        for row in tqdm(reader):
            if row[9] == '__label__non-political':
                data.pop(row[0], None)
    return data


if __name__ == '__main__':
    unlabeled_channels = read_channels(
        "../output/final_data/U_channelData.json")
    remove_NP_data(unlabeled_channels)
    with open("../output/final_data/U_channelData_multilabel.json", "w") as f:
        json.dump(unlabeled_channels, f, indent=4)
channel['ChannelTitle'] = preprocess(channel['ChannelTitle']) channel['Description'] = preprocess(channel['Description']) for video in channel['top3videos']: video['title'] = preprocess(video['title']) video['description'] = preprocess(video['description']) if 'tags' in video.keys(): for index, tag in enumerate(video['tags']): video['tags'][index] = preprocess(tag) if __name__ == '__main__': # p_data = read_channels("../output/P_channelData.json") # np_data = read_channels("../output/NP_channelData.json") # u_data = read_channels("../output/unlabeled_data/U_channelData.json") p_ml_data = read_channels("../output/P_channelData_multi_label.json") # # preprocess_dataset(p_data) # with open('../output/final_data/P_channelData.json', "w") as f: # json.dump(p_data, f, indent=4) # # preprocess_dataset(np_data) # with open('../output/final_data/NP_channelData.json', "w") as f: # json.dump(np_data, f, indent=4) # # preprocess_dataset(u_data) # with open('../output/final_data/U_channelData.json', "w") as f: # json.dump(u_data, f, indent=4) preprocess_dataset(p_ml_data)