# -*- coding: utf-8 -*-
"""Score every Twitter user against every stored friends collection.

For each (user, collection) pair, count how many entries of the
collection's friends list equal the user's ``friends`` value and persist
that count on the user document via ``add_collection_result``.
"""
import os
import sys

# Path for spark source folder
from pymongo import MongoClient

from DAOCollectionsFriends import TwitterFriendsCollections
from DAOTwitterUsers import TwitterUsers

# Mongo connection and DAO helpers for the SilverEye database.
client = MongoClient("127.0.0.1", 27017, connect=True)
db = client["SilverEye"]
twitter_users = TwitterUsers(client, "SilverEye")
twitter_friends_collections = TwitterFriendsCollections(client, "SilverEye")

for user in twitter_users.get_all_users():
    for collection in twitter_friends_collections.get_all_collections():
        # BUG FIX: the original initialised `match` once before all loops,
        # so the counter accumulated across every (user, collection) pair
        # and a running total was stored instead of a per-pair count.
        # Reset it here so each stored result is independent.
        match = 0
        # NOTE(review): key "fiends" looks like a typo for "friends" —
        # left unchanged in case the stored documents really use that
        # field name; verify against the collection schema.
        for friends in collection["fiends"]:
            if friends == user["friends"]:
                match += 1
        # NOTE(review): flattened source lost indentation — assuming the
        # result is written once per (user, collection) pair; confirm.
        twitter_users.add_collection_result(user["_id"], collection["_id"], match)
#%matplotlib inline import matplotlib import numpy as np import matplotlib.pyplot as plt sc = pyspark.SparkContext('local[*]') sqlContext = SQLContext(sc) sqlContext.setConf("spark.sql.caseSensitive", "true"); logger = sc._jvm.org.apache.log4j logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL) client = MongoClient("127.0.0.1", 27017, connect=True) db = client["SilverEye"] twitter_users = TwitterUsers(client, "SilverEye") collections_friends_dao = TwitterFriendsCollections(client, "SilverEye") users_related_array =twitter_users.get_users_and_friends_array() users_related = sc.parallelize(users_related_array) users_related_rdd = users_related.map(lambda row: {"id": row[0], "friends": row[1]}) users_related_rdd.cache() #print users_related_rdd.collect() result_df_7 = sqlContext.read.parquet("result.df.7") result_df_7 = result_df_7.selectExpr("id as id", "label as label7")
#%matplotlib inline import matplotlib import numpy as np import matplotlib.pyplot as plt sc = pyspark.SparkContext('local[*]') sqlContext = SQLContext(sc) sqlContext.setConf("spark.sql.caseSensitive", "true") logger = sc._jvm.org.apache.log4j logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL) client = MongoClient("127.0.0.1", 27017, connect=True) db = client["SilverEye"] twitter_users = TwitterUsers(client, "SilverEye") collections_friends_dao = TwitterFriendsCollections(client, "SilverEye") result_df_4_5_6_7 = sqlContext.read.parquet("result.df.min") result_df_4_5_6_7.show() #print result_df_4_5_6_7.count() result_df_political = sqlContext.read.parquet("result.df.f_political") #result_df_political.show() clusters_result = result_df_political.join( result_df_4_5_6_7, result_df_political["identifier"] == result_df_4_5_6_7["id"]) clusters_result = clusters_result.drop("id")
# NOTE(review): the leading ")" closes a call whose opening is outside
# this chunk — the start of this cell/script is not visible here.
)

# Local-mode Spark context using all cores. NOTE(review): `pyspark`,
# SQLContext, MongoClient, TwitterUsers, GraphUsersExtractor and
# `spark_home` are not defined in this cell — presumably provided by an
# earlier notebook cell; confirm.
sc = pyspark.SparkContext('local[*]')
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.caseSensitive", "true");
# Silence Spark's log4j output except for fatal errors.
logger = sc._jvm.org.apache.log4j
logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)

# Python 2 print statement — this code targets Python 2.
print spark_home, sc

from graphframes import GraphFrame
from pyspark.sql.functions import col

# Mongo connection and DAO helper for the SilverEye database.
client = MongoClient("127.0.0.1", 27017, connect=True)
db = client["SilverEye"]
twitter_users = TwitterUsers(client, "SilverEye")
graph_users_extractor = GraphUsersExtractor(twitter_users)

# Extract the follower graph: vertices with (id, namespace) and edges
# with (src, dst, relationship), then build and display a GraphFrame.
edges = graph_users_extractor.get_edges()
vertex = graph_users_extractor.get_vertex()
v = sqlContext.createDataFrame(vertex, ["id", "namespace"])
# Create an Edge DataFrame with "src" and "dst" columns
e = sqlContext.createDataFrame(edges, ["src", "dst", "relationship"])
# Create a GraphFrame
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()

# Load the political-feature clustering results from parquet.
result_df_political = sqlContext.read.parquet("result.df.f_political")