예제 #1
0
def analysis():
    """Print subscription statistics for the merged P/NP channel data.

    Loads the political and non-political channel files, merges them, and
    prints mean/median/max/min of subscription counts — first including
    channels with zero subscriptions, then excluding them — followed by the
    number of channels that have any cross-comment links.
    """
    channels = read_channels('../output/final_data/P_channelData.json')
    channels.update(read_channels('../output/final_data/NP_channelData.json'))

    counts_with_zero = [len(c['subscriptions']) for c in channels.values()]
    counts_nonzero = [n for n in counts_with_zero if n != 0]

    # Same four statistics for both variants, in the same order.
    for counts in (counts_with_zero, counts_nonzero):
        print(np.mean(counts))
        print(np.median(counts))
        print(max(counts))
        print(min(counts))

    print(len([c for c in channels.values() if len(c['cross'])]))
예제 #2
0
def create_text_df(mode='captions', labeled=True, binary=True):
    """Build a DataFrame of channel text data.

    Args:
        mode: which text source to use — 'captions', 'comments', or anything
            else for snippets.
        labeled: read the labeled (P/NP) files instead of the unlabeled (U)
            ones.
        binary: read the binary-label files instead of the multilabel ones.

    Returns:
        A ``pd.DataFrame`` built from the per-row dicts of the selected
        source.
    """
    if labeled and binary:
        data = read_channels("../output/final_data/P_channelData.json")
        data.update(
            read_channels("../output/final_data/NP_channelData.json"))
    elif labeled:
        data = read_channels(
            "../output/final_data/P_channelData_multilabel.json")
    elif binary:
        data = read_channels("../output/final_data/U_channelData.json")
    else:
        data = read_channels(
            "../output/final_data/U_channelData_multilabel.json")

    if mode == 'captions':
        rows = create_caption_data_dict(data, binary=binary)
    elif mode == 'comments':
        rows = create_comment_data_dict(data, binary=binary)
    else:  # snippets
        rows = create_snippet_data_dict(data, binary=binary)
    return pd.DataFrame(rows)
def create_merged_dataframe(binary: bool = True):
    """Compute cross-comment links between known channels and dump to JSON.

    For every channel, counts which *other* known channels commented on its
    videos and writes ``{channel_id: {cross_comments, SoftTags, ChannelId,
    ChannelTitle}}`` to ``cross_comments[_multilabel].json``.

    Args:
        binary: if True, merge the P/NP labeled data with the unlabeled
            data; otherwise use the multilabel files.
    """
    # BUG FIX: the original signature was ``def create_merged_dataframe(binary: True)``
    # — ``True`` was (ab)used as a type annotation and the parameter had no
    # default, so calling the function without arguments raised TypeError.
    if binary:
        channels = read_channels('../output/final_data/P_channelData.json')
        channels.update(
            read_channels('../output/final_data/NP_channelData.json'))
        channels_unlabeled = read_channels(
            '../output/final_data/U_channelData.json')
    else:
        channels = read_channels(
            '../output/final_data/P_channelData_multilabel.json')
        channels_unlabeled = read_channels(
            '../output/final_data/U_channelData_multilabel.json')
    channels.update(channels_unlabeled)

    all_channels = list(channels.keys())
    known = set(all_channels)  # O(1) membership tests in the hot loop below
    cross_comments = {}
    for channel_id, channel in tqdm(channels.items()):
        # Per-author comment counts, keyed by every known channel (order
        # preserved so the output list order matches the original code).
        author_counts = dict.fromkeys(all_channels, 0)
        for video in channel['comments'].values():
            if video is None:
                continue
            for comment in video:
                if comment['authorChannelId'] is None:
                    continue
                author = comment['authorChannelId']['value']
                # Only count known channels; ignore self-comments.
                if author in known and author != channel['ChannelId']:
                    author_counts[author] += 1
        cross_comments[channel_id] = {
            'cross_comments':
            [author for author, n in author_counts.items() if n > 0],
            'SoftTags': channel['SoftTags'],
            'ChannelId': channel['ChannelId'],
            'ChannelTitle': channel['ChannelTitle'],
        }

    print('dumping')
    if binary:
        output_path = "../output/final_data/cross_comments.json"
    else:
        output_path = "../output/final_data/cross_comments_multilabel.json"
    with open(output_path, "w") as f:
        json.dump(cross_comments, f, indent=4)
    print('ok')
def main():
    """Fetch subscription and affiliation info for each unlabeled channel.

    Reads the unlabeled channel data, queries the YouTube API per channel,
    and writes the enriched data back to disk.
    """
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    load_dotenv()
    api_keys = os.getenv('API_KEYS').split(',')

    api_scraper = RelationAPIScraper(api_keys, 0)

    channel_data = read_channels(
        '../output/unlabeled_data/U_channelDataWithComments.json')
    for channel_id in tqdm(channel_data.keys()):
        entry = channel_data[channel_id]
        entry['subscriptions'] = api_scraper.gather_info(
            channel_id, 'subscriptions')
        entry['related_channels'] = api_scraper.gather_info(
            channel_id, 'affiliations')

    with open('../output/unlabeled_data/U_channelData.json', "w") as f:
        json.dump(channel_data, f, indent=4)
예제 #5
0
def main():
    """Fetch comments for the top-3 videos of each unlabeled channel.

    Reads the channel data with captions, queries the comment API per video,
    and writes the result to the with-comments JSON file.
    """
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    load_dotenv()
    api_keys = os.getenv('API_KEYS').split(',')

    api_scraper = CommentAPIScraper(api_keys, 0)

    channel_data = read_channels(
        '../output/unlabeled_data/U_channelDataWithCaptions.json')
    for channel_id in tqdm(channel_data.keys()):
        comment_data = {}
        for video in channel_data[channel_id]['top3videos']:
            video_id = video['VideoId']
            comment_data[video_id] = api_scraper.gather_info(video_id)
        channel_data[channel_id]['comments'] = comment_data

    with open("../output/unlabeled_data/U_channelDataWithComments.json",
              "w") as f:
        json.dump(channel_data, f, indent=4)
예제 #6
0
def plot_channels(path_P,
                  path_NP,
                  path_U,
                  merge=False,
                  content='related_channels',
                  only_known=False,
                  directed=False,
                  output='network_plot.html',
                  line_width=1.0):
    """Render the channel connection network as an interactive HTML plot.

    Builds an edge list from every channel's ``content`` entries, lays the
    graph out with a spring layout, and draws it with plotly, writing the
    figure to ``output``.

    Args:
        path_P: path to the political channel-data JSON file.
        path_NP: path to the non-political channel-data JSON file.
        path_U: path to the unlabeled channel-data JSON file.
        merge: if True, merge the NP data into the P data (the U data is
            loaded but deliberately not merged — see commented line below).
        content: per-channel key providing the edges (e.g.
            'related_channels', 'subscriptions', 'cross_comments').
        only_known: if True, drop edges whose target is not a known channel.
        directed: if True, draw arrows showing edge direction instead of
            plain edge lines.
        output: HTML file the figure is written to.
        line_width: width of the edge lines in the undirected plot.
    """
    channels = read_channels(path_P)
    if merge:
        channels_NP = read_channels(path_NP)
        channels.update(channels_NP)
        channels_U = read_channels(path_U)
        # channels.update(channels_U)
    network_data = pd.DataFrame({'source': [], 'target': []})
    for channel, value in tqdm(channels.items()):
        for affiliate in value[content]:
            tmp = pd.DataFrame({'source': [channel], 'target': [affiliate]})
            # NOTE: DataFrame.append was removed in pandas 2.0; kept to match
            # the pandas version this project uses.
            network_data = network_data.append(tmp)

    network_data['index'] = range(0, len(network_data))
    network_data.set_index('index', inplace=True)

    # get basic information about unknown channels, but only if in related_channels mode, otherwise there are too many
    # BUG FIX: default to an empty mapping. The original only defined
    # related_channel_info inside the 'related_channels' branch, so the node
    # loop below raised NameError for any other content as soon as it met a
    # node that was not in `channels`.
    related_channel_info = {}
    if content == 'related_channels':
        # get channel info via API
        # unknown_channels = []
        # for index, row in network_data.iterrows():
        #     if row['target'] not in channels.keys():
        #         unknown_channels.append(row['target'])
        # this method uses api quota so it should be called rarely, instead, the file should be read from disk
        # related_channel_info = get_channel_snippets(unknown_channels)

        # or use previous execution of that function, that was written to a file
        related_channel_info = read_channels('related_channels.json')

    if only_known:
        # Collapse unknown targets to a sentinel, then drop those rows.
        network_data['target'] = network_data['target'].apply(
            lambda x: 'unknown' if x not in channels.keys() else x)
        network_data = network_data[network_data['target'] != 'unknown']

    di_G = nx.from_pandas_edgelist(network_data, create_using=nx.DiGraph())
    G = nx.from_pandas_edgelist(network_data)
    # Fixed seed keeps the layout reproducible between runs.
    coordinates = spring_layout(G, seed=42, iterations=50)
    edge_x = []
    edge_y = []
    x_0 = []
    y_0 = []
    x_1 = []
    y_1 = []
    for edge in di_G.edges():
        x0, y0 = coordinates[edge[0]]
        x1, y1 = coordinates[edge[1]]
        # None separates line segments in a single plotly scatter trace.
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)
        x_0.append(x0)
        y_0.append(y0)
        x_1.append(x1)
        y_1.append(y1)

    node_df = pd.DataFrame({
        'ChannelId': [],
        'Title': [],
        'Subscribers': [],
        'ChannelViews': [],
        '#related_channels': [],
        '#subscriptions': [],
        'in_degree': [],
        'x': [],
        'y': [],
        'label': [],
        'labels': [],
        'known': []
    })

    # Per node: prefer info from the known channels, fall back to the
    # related_channel_info lookup, then to an 'Unknown' placeholder.
    for node in G.nodes():
        x, y = coordinates[node]
        tmp = pd.DataFrame({
            'ChannelId': [node],
            'Title':
            [channels[node]['ChannelTitle']] if node in channels.keys() else
            [related_channel_info[node]['ChannelTitle']]
            if node in related_channel_info.keys() else ['Unknown'],
            'Subscribers': [channels[node]['Subs']] if node in channels.keys()
            else [related_channel_info[node]['Subs']]
            if node in related_channel_info.keys() else ['Unknown'],
            'ChannelViews':
            [channels[node]['ChannelViews']] if node in channels.keys() else
            [related_channel_info[node]['ChannelViews']]
            if node in related_channel_info.keys() else ['Unknown'],
            '#related_channels': [len(channels[node]['related_channels'])]
            if node in channels.keys() else
            [related_channel_info[node]['related_channels']]
            if node in related_channel_info.keys() else [0],
            '#subscriptions': [len(channels[node]['subscriptions'])]
            if node in channels.keys() else [0],
            'in_degree':
            di_G.in_degree[node],
            'x': [x],
            'y': [y],
            'label': [
                'Unknown' if node not in channels.keys() else
                'Non-Political' if channels[node]['SoftTags'][0]
                == 'Non-Political' else 'Unlabeled' if channels[node]
                ['SoftTags'][0] == 'UNLABELED' else 'Political'
            ],
            'labels': [determine_labels(channels[node]['SoftTags'])]
            if node in channels.keys() else ['Unknown'],
            'known':
            str(node in channels.keys())
        })
        node_df = node_df.append(tmp)

    if directed:
        fig = px.scatter(
            node_df,
            x="x",
            y="y",
            color="labels",
            hover_data=['Title', 'ChannelId', 'labels', '#subscriptions'],
            size='in_degree',
            color_discrete_sequence=[
                "lightcoral", "red", "green", "lime", "orange", "cyan",
                "mediumslateblue", "blue", "magenta", "honeydew", "gray",
                "goldenrod", "teal", "yellow", "CornflowerBlue", "LightPink"
            ]).update_layout(
                dict(annotations=[
                    dict(ax=x_0[i],
                         ay=y_0[i],
                         axref='x',
                         ayref='y',
                         x=x_1[i],
                         y=y_1[i],
                         xref='x',
                         yref='y',
                         showarrow=True,
                         arrowhead=2,
                         arrowsize=1,
                         arrowwidth=1) for i in range(0, len(x_0))
                ]))
        # BUG FIX: the original never assigned figTotal in this branch, so
        # the code below raised NameError whenever directed=True.
        figTotal = fig
    else:
        fig = px.scatter(
            node_df,
            x="x",
            y="y",
            color="label",
            hover_data=['Title', 'ChannelId', 'labels'],
            color_discrete_sequence=[
                "red", "green", "lightcoral", "lime", "black", "cyan",
                "#feafda", "blue", "magenta", "honeydew", "gray", "goldenrod",
                "darkmagenta", "yellow", "CornflowerBlue", "LightPink"
            ])

        # Separate line-only trace for the edges, drawn underneath the nodes.
        fig2 = go.Figure(
            data=go.Scatter(x=edge_x,
                            y=edge_y,
                            line=dict(width=line_width, color='#888'),
                            hoverinfo='skip',
                            mode='lines',
                            showlegend=False))

        figTotal = go.Figure()
        figTotal.add_trace(fig2['data'][0])
        for trace in fig['data']:
            figTotal.add_trace(trace)

    figTotal.update_traces(marker=dict(size=10,
                                       line=dict(width=1,
                                                 color='DarkSlateGrey')),
                           selector=dict(mode='markers'))

    figTotal.update_xaxes(showgrid=False, zeroline=False, visible=False)
    figTotal.update_yaxes(showgrid=False, zeroline=False, visible=False)

    figTotal.show()
    figTotal.write_html(output)
예제 #7
0
import csv
import json

from utils.data_IO import read_channels

if __name__ == '__main__':
    # Find unlabeled channels that duplicate labeled political channels,
    # record their ids in a CSV, and remove them from the unlabeled data.
    p_data = read_channels("../output/P_channelData.json")
    # np_data = read_channels("../output/NP_channelData.json")
    u_data = read_channels("../output/final_data/U_channelData.json")
    duplicate_channels = []
    for key, value in u_data.items():
        if key in p_data:
            print(value['ChannelTitle'])
            duplicate_channels.append(key)
    # BUG FIX: newline='' is required for files handed to csv.writer —
    # without it the csv module emits extra blank rows on Windows.
    with open('duplicate_data.csv', 'w', newline='') as result_file:
        wr = csv.writer(result_file, dialect='excel')
        wr.writerow(duplicate_channels)
    for key in duplicate_channels:
        u_data.pop(key, None)
    with open("../output/final_data/U_channelData.json", "w") as f:
        json.dump(u_data, f, indent=4)

def get_connection_dataframe(content='related_channels',
                             only_known=True,
                             labeled=True,
                             binary=True):
    """Build a one-hot encoded channel-connection DataFrame.

    Loads the channel data selected by ``content``/``binary``, collects each
    channel's connection list (optionally restricted to known channels),
    one-hot encodes the connections via ``Counter`` +
    ``pd.DataFrame.from_records``, writes the labeled or unlabeled slice to
    disk with ``write_df``, and returns that slice.

    Args:
        content: per-channel key providing the connections
            ('cross_comments' selects the precomputed cross-comment files;
            anything else reads the merged channel-data files).
        only_known: keep only connections that point to known channels.
        labeled: return the labeled (True) or unlabeled (False) slice.
        binary: use the binary (P/NP) files instead of the multilabel ones.

    Returns:
        The encoded ``pd.DataFrame`` slice. NOTE(review): one branch below
        can fall through without returning, yielding ``None`` — see comment.
    """
    start_time = time.time()
    if content == 'cross_comments':
        if binary:
            channels = read_channels(
                '../output/final_data/cross_comments.json')
        else:
            channels = read_channels(
                '../output/final_data/cross_comments_multilabel.json')
    else:
        if binary:
            channels = read_channels('../output/final_data/P_channelData.json')
            channels_NP = read_channels(
                '../output/final_data/NP_channelData.json')
            channels.update(channels_NP)
            channels_unlabeled = read_channels(
                '../output/final_data/U_channelData.json')
        else:
            channels = read_channels(
                '../output/final_data/P_channelData_multilabel.json')
            channels_unlabeled = read_channels(
                "../output/final_data/U_channelData_multilabel.json")
        channels.update(channels_unlabeled)
    # Number of labeled channels. The slicing below relies on all labeled
    # entries preceding the unlabeled ones in the (insertion-ordered) dict —
    # presumably guaranteed by how the files are merged; TODO confirm.
    labeled_index = len([
        channel for channel in channels.values()
        if channel['SoftTags'][0] != 'UNLABELED'
    ])
    all_channels = list(channels.keys())

    connections_df = pd.DataFrame({
        'ChannelId': [],
        'ChannelTitle': [],
        'label': [],
        'connections': []
    })

    # One row per channel: id, title, label, and its connection list.
    for key, value in channels.items():
        if only_known:
            tmp = pd.DataFrame({
                'ChannelId': [key],
                'ChannelTitle': [value['ChannelTitle']],
                'label':
                [determine_label(labels=value['SoftTags'], binary=binary)],
                'connections': [[
                    affiliate for affiliate in value[content]
                    if affiliate in all_channels
                ]]
            })
        else:
            tmp = pd.DataFrame({
                'ChannelId': [key],
                'ChannelTitle': [value['ChannelTitle']],
                'label':
                [determine_label(labels=value['SoftTags'], binary=binary)],
                'connections': [value[content]]
            })

        # Placeholder so Counter/from_records below produce a row even for
        # channels without connections; the 'None' column is dropped later.
        if len(tmp['connections'][0]) == 0:
            tmp.at[0, 'connections'] = ['None']
        connections_df = connections_df.append(tmp)

    # One-hot encode: Counter per row -> one column per connected channel.
    tmp = connections_df['connections'].apply(Counter)
    tmp = tmp.reset_index().drop(columns='index')
    encoded_connections_df = pd.DataFrame.from_records(
        tmp['connections']).fillna(value=0).drop(columns='None')
    encoded_connections_df['label'] = list(connections_df['label'])
    encoded_connections_df['id'] = list(connections_df['ChannelId'])
    encoded_connections_df['title'] = list(connections_df['ChannelTitle'])
    end_time = time.time() - start_time
    print(f"finishing:  {content} after {end_time} second")

    if labeled:
        if binary:  # We want political and non political data in the dataset
            # return all labeled channels
            write_df(encoded_connections_df.iloc[:labeled_index],
                     content=content,
                     binary=binary,
                     labeled=labeled)
            return encoded_connections_df.iloc[:labeled_index]
        # NOTE(review): this elif is unreachable — it is nested inside
        # `if labeled:` so `not labeled` is always False here. If it were
        # reached and no 'UNLABELED' row existed, the function would also
        # fall through and implicitly return None. Verify intended behavior.
        elif content == 'cross_comments' and not labeled:  # We only want the political data
            # return all labeled political channels
            # unfortunately there is no nice way to index non-scalar values
            for index, row in encoded_connections_df.iterrows():
                if row['label'] == ['UNLABELED']:
                    write_df(encoded_connections_df.iloc[:index],
                             content=content,
                             binary=binary,
                             labeled=labeled)

                    return encoded_connections_df.iloc[:index]

        else:
            # in this case the whole dataset only contains political and unlabeled channels
            write_df(encoded_connections_df.iloc[:labeled_index],
                     content=content,
                     binary=binary,
                     labeled=labeled)

            return encoded_connections_df.iloc[:labeled_index]
    else:
        # Unlabeled slice: everything after the labeled prefix, reindexed.
        write_df(
            encoded_connections_df.iloc[labeled_index:].reset_index().drop(
                columns='index'),
            content=content,
            binary=binary,
            labeled=labeled)

        return encoded_connections_df.iloc[labeled_index:].reset_index().drop(
            columns='index')
예제 #9
0
    subs = [int(channel['Subs']) for channel in NP_data.values()]
    subs.append(1)
    mean_subs = np.mean(subs)
    median = np.median(subs)
    std = np.std(subs)
    print(mean_subs)
    print(median)
    print(std)
    print(np.max(subs))
    print(np.min(subs))
    # x = 10 ** np.random.uniform(size=1000)
    plt.hist(subs, 50)
    # plt.hist(x, bins=10 ** np.linspace(0, 1, 10))
    min_ylim, max_ylim = plt.ylim()
    plt.axvline(mean_subs, color='k', linestyle='dashed', linewidth=1)
    plt.text(mean_subs * 1.1, max_ylim * 0.8, 'Mean: {:.2f}'.format(mean_subs))
    plt.axvline(median, color='g', linestyle='dashed', linewidth=1)
    plt.text(median * 0.58, max_ylim * 0.9, 'Median: {:}'.format(int(median)))
    plt.xscale('log')
    # plt.xlim(left, right)
    plt.xlabel('Number of Subscribers', size=13)
    plt.ylabel('Number of Channels', size=13)
    plt.show()


if __name__ == "__main__":
    # Load the labeled (political / non-political) channel data and plot
    # subscriber statistics for both sets.
    P_data = read_channels('../output/final_data/P_channelData.json')
    NP_data = read_channels('../output/final_data/NP_channelData.json')

    plotSubscriptions(P_data, NP_data)
예제 #10
0
                        developerKey=self.api_keys[self.api_number + 1])
                    self.api_number += 1
                    # try again
                    return self.gather_info(videoId)
                else:
                    raise http_error


if __name__ == "__main__":
    # Fetch full video snippets for each channel's top-3 scraped videos.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    load_dotenv()
    api_keys = os.getenv('API_KEYS').split(',')
    api_number = 0

    api_scraper = VideoSnippetAPIScraper(api_keys, api_number)

    channelData = read_channels(
        '../output/unlabeled_data/U_channelDataWithScrapedTopVideos.json')
    for channel in tqdm(channelData.keys()):
        # BUG FIX: the original filter was ``if not None``, which is always
        # True and therefore filtered nothing; the intent was evidently to
        # skip missing video entries.
        videoIDs = [
            video for video in channelData[channel]['top3videos']
            if video is not None
        ]
        video_snippets = []
        for videoId in videoIDs:
            video_snippets.append(api_scraper.gather_info(videoId))
        channelData[channel]['top3videos'] = video_snippets

    with open("../output/unlabeled_data/U_channelDataWithTopVideos.json",
              "w") as f:
        json.dump(channelData, f, indent=4)
import csv
import json

from tqdm import tqdm

from utils.data_IO import read_channels


def remove_NP_data(data):
    """Remove channels predicted as non-political from ``data`` in place.

    Reads the prediction CSV (channel id in column 0, predicted label in
    column 9) and pops every channel labeled non-political.

    Args:
        data: mapping of channel id -> channel dict; mutated in place.

    Returns:
        The same ``data`` mapping, for convenience.
    """
    with open("unlabeled_data_predictions.csv", 'r') as pred_file:
        rows = csv.reader(pred_file, delimiter=';', quotechar='|')
        next(rows, None)  # skip the header row
        non_political = [
            row[0] for row in tqdm(rows)
            if row[9] == '__label__non-political'
        ]

    for channel_id in non_political:
        data.pop(channel_id, None)

    return data


if __name__ == '__main__':
    # Drop predicted non-political channels from the unlabeled data and
    # write the result as the multilabel unlabeled data set.
    channels = remove_NP_data(
        read_channels("../output/final_data/U_channelData.json"))
    with open("../output/final_data/U_channelData_multilabel.json", "w") as f:
        json.dump(channels, f, indent=4)
예제 #12
0
        channel['ChannelTitle'] = preprocess(channel['ChannelTitle'])
        channel['Description'] = preprocess(channel['Description'])
        for video in channel['top3videos']:
            video['title'] = preprocess(video['title'])
            video['description'] = preprocess(video['description'])
            if 'tags' in video.keys():
                for index, tag in enumerate(video['tags']):
                    video['tags'][index] = preprocess(tag)


if __name__ == '__main__':
    # Run text preprocessing in place on the multilabel political data.
    multilabel_data = read_channels("../output/P_channelData_multi_label.json")
    preprocess_dataset(multilabel_data)