def double_linked_channels():
    # Build a mapping channel_id -> list of linked channel ids. When a link
    # is mutual (both channels feature each other), the target's canonical
    # channel_id is recorded; otherwise the raw URL id is used as a fallback.
    file = dict()
    for filename in os.listdir(CHANNEL_FOLDER):
        if filename.endswith(".json"):
            data = load_json(CHANNEL_FOLDER + filename)
            channel_id = data["channel_id"]
            file[channel_id] = []
            # Default to an empty list: not every channel features others,
            # and iterating None would crash the loop below.
            featured_channels_ids = []
            if "featuredChannelsUrls" in data.keys():
                featured_channels_ids = data["featuredChannelsUrls"]
            for id in featured_channels_ids:
                filepath = CHANNEL_FOLDER + id + ".json"
                data2 = load_json(filepath)  #!!! get_channel(id)
                if data2 != None:
                    featured_channels_ids2 = []
                    if "featuredChannelsUrls" in data2.keys():
                        featured_channels_ids2 = data2["featuredChannelsUrls"]
                    if channel_id in featured_channels_ids2:
                        file[channel_id].append(data2["channel_id"])
                    else:
                        file[channel_id].append(id)
    # Save before returning (the save used to sit after the return and was
    # unreachable); the path matches the one cluster_policy() loads.
    save_json(file, "analytics/double_linked_channels.json")
    return file
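# A sketch of the resulting analytics/double_linked_channels.json, with its
# shape inferred from the code above (the ids are only placeholders taken
# from the SUB_ID examples below):
#
# {
#     "UC2DjFE7Xf11URZqWBigcVOQ": ["UC9YTp5M6yYgSd6t0SeL2GQw"],
#     "UC9YTp5M6yYgSd6t0SeL2GQw": ["UC2DjFE7Xf11URZqWBigcVOQ"]
# }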
def main(): print("Starting") if len(sys.argv) != 2: print("Usage: " + sys.argv[0] + " request_path.json") exit() filepath = str(sys.argv[1]) request = load_json(filepath) request_id = str(request["request_id"]) depth = int(request["depth"]) seed = [] seed.append(str(request["seed"])) print("Generating graph") graph = generate_graph(POLICY="cluster_policy", SEED=seed, DEPTH=depth) print("Analyzing graph") analytics = analyze_graph(graph) file = dict() file["request"] = request file["graph"] = graph file["analytics"] = analytics print("Saving graph") save_json(file, "response_" + request_id + ".json") print("Statistics") print("vertices", len(graph["vertices"])) print("edges", len(graph["edges"]))
def gather_keywords():
    # Map channel_id -> keywords for every saved channel file.
    keywords = dict()
    for filename in os.listdir(CHANNEL_FOLDER):
        if filename.endswith(".json"):
            #print("Loading file " + filename)
            j = load_json(CHANNEL_FOLDER + filename)
            keywords[j["channel_id"]] = j["keywords"]
    return keywords
def main(): print("Starting...") if len(sys.argv) != 2: print("Usage: " + sys.argv[0] + " request_path.json") exit() filepath = str(sys.argv[1]) if True: #maybe you want to update everything for filename in os.listdir(CHANNEL_FOLDER): if filename.endswith(".json"): j = load_json(CHANNEL_FOLDER + filename) channel_id = j["channel_id"] channel_name = filename.replace(".json", "") SAVED_CHANNELS[channel_id] = channel_name print("Loaded " + str(len(SAVED_CHANNELS.keys())) + " saved channels") #for url in SAVED_CHANNELS: # print(SAVED_CHANNELS[url], "->", url) #read in request request = load_json(filepath) request_id = str(request["request_id"]) depth = int(request["depth"]) seed = [] seed.append(str(request["seed"])) print("Processing request", request_id) print("Starting crawl with seed", seed, " and depth", depth) crawl(SEED=seed, DEPTH=depth) print("Crawled", len(CRAWLED_CHANNELS.keys()), "with", NEW_CHANNELS, "new channels!") for url in CRAWLED_CHANNELS: print( str(CRAWLED_CHANNELS[url].encode("ascii", "ignore")) + " -> " + url) print("Done!")
def cluster_policy(SEED, DEPTH):
    #[get_channel(id) for id in data[current_id]]
    graph = dict()
    graph["vertices"] = []
    graph["edges"] = []
    graph["depth"] = DEPTH
    graph["vertex_count_at_depth"] = []

    seed = SEED[0]
    linked_channels = load_json("analytics/double_linked_channels.json")
    traversed_channels_ids = []
    current_channel_ids = []
    current_channel_ids.append(seed)
    current_depth = 0
    while current_depth <= DEPTH:
        print("current_depth " + str(current_depth))
        next_ids = []
        y_count = 0
        for current_id in current_channel_ids:
            if current_id in linked_channels.keys():
                vertex = dict()
                vertex["id"] = current_id
                vertex["coords"] = dict()
                vertex["coords"]["x"] = current_depth
                vertex["coords"]["y"] = y_count
                vertex["channel_data"] = get_channel(current_id)
                vertex["color"] = "white"
                graph["vertices"].append(vertex)
                if C_P_INFO:
                    print("Created vertex for channel " +
                          vertex["channel_data"]["title"])
                for linked_id in linked_channels[current_id]:
                    if linked_id not in traversed_channels_ids:
                        if get_channel(linked_id) != None:
                            edge_out_of_max_depth = False
                            if current_depth != DEPTH:
                                if (linked_id not in next_ids and
                                        linked_id not in current_channel_ids):
                                    next_ids.append(linked_id)
                            else:
                                if linked_id not in current_channel_ids:
                                    edge_out_of_max_depth = True
                            if not edge_out_of_max_depth:
                                # this could be enhanced by only searching
                                # in the current column edges
                                redundant_edge = False
                                for edge in graph["edges"]:
                                    source_id = edge["source_vertex_id"]
                                    dest_id = edge["dest_vertex_id"]
                                    if source_id == current_id and dest_id == linked_id:
                                        redundant_edge = True
                                    elif source_id == linked_id and dest_id == current_id:
                                        redundant_edge = True
                                if not redundant_edge:
                                    edge = dict()
                                    edge["id"] = len(graph["edges"]) + 1
                                    edge["source_vertex_id"] = current_id
                                    edge["dest_vertex_id"] = linked_id
                                    edge["color"] = "white"
                                    graph["edges"].append(edge)
                                    # only report edges that were actually created
                                    if C_P_INFO:
                                        print("Created edge: " +
                                              vertex["channel_data"]["title"] +
                                              " -> " +
                                              get_channel(linked_id)["title"])
                                #else:
                                #    print("Redundant edge")
                                #    print(current_id)
                                #    print(linked_id)
                        else:
                            print("No data for id " + linked_id)
                # end of linked_id loop
                y_count += 1
        # end of current_id loop
        graph["vertex_count_at_depth"].append(y_count)
        traversed_channels_ids.extend(current_channel_ids)
        current_channel_ids = next_ids
        current_depth += 1
    return graph
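# A minimal sketch of the graph dict this policy returns, for a one-edge
# result at depth 1 (field names are taken from the code above; the values
# are made up):
#
# {
#     "depth": 1,
#     "vertex_count_at_depth": [1, 1],
#     "vertices": [{"id": "UC...", "coords": {"x": 0, "y": 0},
#                   "channel_data": {...}, "color": "white"}, ...],
#     "edges": [{"id": 1, "source_vertex_id": "UC...",
#                "dest_vertex_id": "UC...", "color": "white"}]
# }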
def crawl_policy(SEED, DEPTH):
    global CHANNEL_FOLDER
    graph = dict()
    graph["vertices"] = []
    graph["edges"] = []
    graph["depth"] = DEPTH
    graph["vertex_count_at_depth"] = []

    current_urls = []
    current_urls.extend(SEED)
    next_urls = []
    traversed_urls = []
    current_depth = 0
    while current_depth <= DEPTH:
        y_count = 0
        for url in current_urls:
            if url not in traversed_urls:
                filename = CHANNEL_FOLDER + url + ".json"
                channel_data = None
                if url + ".json" in os.listdir(CHANNEL_FOLDER):
                    channel_data = load_json(filename)
                if channel_data != None:
                    vertex = dict()
                    vertex["id"] = url
                    vertex["coords"] = dict()
                    vertex["coords"]["x"] = current_depth
                    vertex["coords"]["y"] = y_count
                    vertex["channel_data"] = channel_data
                    vertex["color"] = "white"
                    graph["vertices"].append(vertex)
                    if current_depth != DEPTH:
                        # not every channel features others, so fall back
                        # to an empty list when the key is missing
                        featuredChannelsUrls = channel_data.get(
                            "featuredChannelsUrls", [])
                        for featured_url in featuredChannelsUrls:
                            if featured_url + ".json" in os.listdir(CHANNEL_FOLDER):
                                if featured_url not in traversed_urls:
                                    next_urls.append(featured_url)
                                    edge = dict()
                                    edge["id"] = len(graph["edges"]) + 1
                                    edge["source_vertex_id"] = url
                                    edge["dest_vertex_id"] = featured_url
                                    edge["color"] = "white"
                                    graph["edges"].append(edge)
                else:
                    print("Error while loading file " + str(filename))
                traversed_urls.append(url)
                y_count += 1
        graph["vertex_count_at_depth"].append(y_count)
        current_urls = next_urls
        next_urls = []
        current_depth += 1
    return graph
SUB_ID["OpenMind3000"] = "UCEtYtMoD26j2BJBJ4w_hM6w" SUB_ID["EEVBlog"] = "UC2DjFE7Xf11URZqWBigcVOQ" SUB_ID["HandOfBlood"] = "UC9YTp5M6yYgSd6t0SeL2GQw" #Mapping of URL -> Title # URL == ChannelID (Identifier in YoutubeAPI) # Title (Name of channel) SAVED_CHANNELS = dict( ) #channels that have been crawled before and are known to the system CRAWLED_CHANNELS = dict() #channels that were crawled in this session #STATISTICS NEW_CHANNELS = 0 #youtube dependent stuff DEVELOPER_KEY = load_json("key.json")["key"] YOUTUBE_API_SERVICE_NAME = "youtube" YOUTUBE_API_VERSION = "v3" YOUTUBE = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY) def get_channel_id(USERNAME): response = YOUTUBE.channels().list(forUsername=USERNAME, part="id").execute() return response["items"][0]['id'] def get_featured_channels(CHANNEL_ID): response = YOUTUBE.channels().list(id=CHANNEL_ID,