def initialize_params(self, partitions=2100, cores=5, memory=11):
    conf = SparkConf()
    conf.set('spark.sql.shuffle.partitions', str(partitions))
    conf.set("spark.executor.cores", str(cores))
    SparkContext.setSystemProperty('spark.executor.memory', str(memory) + 'g')
    SparkContext.setSystemProperty('spark.driver.memory', str(memory) + 'g')
    self.sc = SparkContext(appName='mm_exp', conf=conf)
    self.sqlContext = pyspark.SQLContext(self.sc)
def start_spark_sentiment_analysis(hashtag):
    sc = ps.SparkContext('local[*]')
    sqlContext = ps.SQLContext(sc)

    tokenizer = Tokenizer(inputCol="Tweets", outputCol="words")
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    hashingTF = HashingTF(numFeatures=10000, inputCol="base_words",
                          outputCol="features")
    lr = LogisticRegression(featuresCol="features", labelCol="Sentiment",
                            elasticNetParam=0.8, regParam=0.001,
                            family="multinomial")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

    partsDF = pandas.read_csv("./usr/local/static/tweet_sentiment.csv")
    datasize = partsDF.shape[0]
    trainSet = partsDF.sample(frac=0.5, replace=False)
    trainSet = sqlContext.createDataFrame(trainSet)
    partsDF = sqlContext.createDataFrame(partsDF)

    lrModel = pipeline.fit(trainSet)
    lrResult = lrModel.transform(partsDF)

    # Accuracy is a percentage; the per-class figures are raw tweet counts.
    accuracy = round(
        lrResult.where('Sentiment == prediction').count() / datasize * 100, 2)
    neutral = lrResult.where('prediction == 0').count()
    supportive = lrResult.where('prediction == 1').count()
    against = lrResult.where('prediction == 2').count()

    print("\n\n\n\n\n|----------------------##----------------------|")
    print("Accuracy=\t", accuracy, "%")
    print("Neutral=\t", neutral, "tweets")
    print("Supportive=\t", supportive, "tweets")
    print("Against=\t", against, "tweets")

    frequencies = [supportive, neutral, against]
    # pandas.Series.from_array was removed; the Series constructor does the same job.
    freq_series = pandas.Series(frequencies)
    x_labels = ['Positive Tweets', 'Neutral Tweets', 'Negative Tweets']
    title = 'Sentiment Analysis on Twitter Data ' + hashtag

    # Plot the figure.
    plt.figure(figsize=(14, 10))
    ax = freq_series.plot(kind='bar', color="green")
    ax.set_title(title, fontsize=24, weight='bold')
    ax.set_xlabel('Sentiment', fontsize=18, weight='bold')
    ax.set_ylabel('Frequency', fontsize=18, weight='bold')
    ax.set_xticklabels(x_labels, fontsize=18, weight='bold', rotation=0)
    plt.savefig("./usr/local/static/result.png")
def pyspark_setting():
    """
    Set up pyspark.

    :return: (SparkContext, SQLContext) pair
    """
    conf = pyspark.SparkConf()
    sc = pyspark.SparkContext(conf=conf)  # pass the conf; it was previously created but never applied
    aws_id = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    s3 = boto3.resource('s3')
    sqlContext = pyspark.SQLContext(sc)
    return sc, sqlContext
def get_spark_session(app_name=SPARK_APP_NAME):
    # configure
    conf = pyspark.SparkConf()
    conf.set('spark.app.name', app_name)
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.debug.maxToStringFields', 1000)
    # init & return
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sc.setLogLevel('WARN')
    return pyspark.SQLContext(sparkContext=sc)
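A minimal usage sketch for the helper above; the app name and CSV path here are placeholders for illustration, not part of the original code:

# Hypothetical usage of get_spark_session(); 'events.csv' is a placeholder path.
sql_context = get_spark_session(app_name='example-app')
df = sql_context.read.csv('events.csv', header=True, inferSchema=True)
df.printSchema()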
def __init__(self, spark_master=None, app_name=None, spark_cores=2,
             spark_memory="1g", ceph_access_key=None, ceph_secret_key=None,
             ceph_host_url=None):
    if not spark_master:
        if os.getenv('SPARK_LOCAL') == "True":
            spark_master = 'local[2]'
            spark_cores = 2
            spark_memory = "1g"
            print("Using local spark")
        else:
            spark_master = "spark://" + os.getenv('OSHINKO_CLUSTER_NAME') + ":7077"

    if not app_name:
        inst = ''.join(
            random.choices(string.ascii_uppercase + string.digits, k=4))
        app_name = inst + ' - Ephemeral Spark Application'

    # Set the configuration
    print("Application Name: ", app_name)
    self.spark_settings = {
        'spark_master': spark_master,
        'app_name': app_name,
        'spark_cores': spark_cores,
        'spark_memory': spark_memory
    }
    conf = pyspark.SparkConf().setAppName(
        self.spark_settings['app_name']).setMaster(spark_master)
    conf.set("spark.cores.max", str(self.spark_settings['spark_cores']))
    conf.set("spark.executor.memory", self.spark_settings['spark_memory'])

    # Set the Spark cluster connection
    self.sc = pyspark.SparkContext.getOrCreate(conf)

    # Set the Hadoop configurations to access Ceph S3
    self.sc._jsc.hadoopConfiguration().set(
        "fs.s3a.access.key", os.getenv('DH_CEPH_KEY', ceph_access_key))
    self.sc._jsc.hadoopConfiguration().set(
        "fs.s3a.secret.key", os.getenv('DH_CEPH_SECRET', ceph_secret_key))
    self.sc._jsc.hadoopConfiguration().set(
        "fs.s3a.endpoint", os.getenv('DH_CEPH_HOST', ceph_host_url))

    # Get the SQL context
    self.sqlContext = pyspark.SQLContext(self.sc)
def main():
    """ Program entrypoint, orchestrates the pipeline"""
    args = parse_args()
    tfidf = create_tf_idf(args.input)
    information_extractor = InformationExtractor(
        args.vectors,
        read_gazetter(args.brands),
        read_gazetter(args.styles),
        read_gazetter(args.materials),
        read_gazetter(args.items),
        read_gazetter(args.probasebrands),
        read_gazetter(args.probasematerials),
        read_gazetter(args.patterns),
        read_gazetter(args.itemtopcategory),
        args.deepdetect,
        args.conf,
        tfidf)
    sc = pyspark.SparkContext(conf=sparkConf())
    sql = pyspark.SQLContext(sc)
    analyze_user(information_extractor, sql, args)
def __connected_spark_cluster(self, resource_url, pilot_description=None):
    conf = pyspark.SparkConf()
    conf.setAppName("Pilot-Spark")
    if pilot_description is not None:
        for key in list(pilot_description.keys()):
            if key.startswith("spark"):
                conf.set(key, pilot_description[key])
    conf.setMaster(resource_url)
    print(conf.toDebugString())
    sc = pyspark.SparkContext(conf=conf)
    sqlCtx = pyspark.SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def _test():
    """ Unit test function for PairwiseEuclideanDistance class. """
    # Reuse an existing SparkContext/SQLContext if one is already defined;
    # the NameError raised otherwise is caught and a fresh one is created.
    try:
        if isinstance(sc, pyspark.SparkContext):
            print("SparkContext:", sc)
    except Exception:
        sc = pyspark.SparkContext()
        print("Created SparkContext:", sc)
    try:
        if isinstance(sqlContext, pyspark.SQLContext):
            print("SQLContext:", sqlContext)
    except Exception:
        sqlContext = pyspark.SQLContext(sc)
        print("Created SQLContext:", sqlContext)

    # Generate dummy vectors
    rdd_data = [(pyspark.mllib.linalg.Vectors.dense([0.0, 1.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([1.0, 1.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([2.0, 3.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([8.0, 9.0]), )]
    df_data = sqlContext.createDataFrame(rdd_data, ["feature"])
    df_data.show()
    df_data.collect()

    # Instantiate a pairwise Euclidean distance transformer
    ped = PairwiseEuclideanDistance(squared=False, inputCol="feature",
                                    outputCol="distance")

    # Get squared Euclidean pairwise distances
    ped.setSquared(True)
    ped.getSquared()
    ped.getInputCol()
    ped.hasDefault("squared")
    print(ped.explainParams())
    ped.transform(df_data).show()

    # Get Euclidean pairwise distances
    ped.setSquared(False)
    print(ped.explainParams())
    ped.transform(df_data).show()

    # Shutdown SparkContext
    sc.stop()
    print("SparkContext is shutdown.")
def __init__(self, topic, spark_ip="local[2]", kafka_ip="localhost:9092"):
    self.topic = topic
    self.kafka_ip = kafka_ip
    sc = SparkContext(spark_ip, appName="WikiStream")
    self.spark = pyspark.SQLContext(sc)
    self.df = self.spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", self.kafka_ip) \
        .option("subscribe", self.topic) \
        .option("startingOffsets", "earliest") \
        .load()
    self.events = self.df.withColumn("value", F.col("value").cast(StringType())) \
        .withColumn("value", F.from_json("value", MAIN_SCHEMA)) \
        .select("value.data.*") \
        .withColumn("time", F.col("meta.dt").cast("timestamp"))
def sql_context(self, application_name):
    """Create a spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting spark
    context.

    Parameters
    ----------
    application_name : string

    Returns
    -------
    (sc, sqlContext) : tuple of (SparkContext, SQLContext)
    """
    sc = self.spark_context(application_name)
    import pyspark
    sqlContext = pyspark.SQLContext(sc)
    return (sc, sqlContext)
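Since the docstring makes the caller responsible for shutting the context down, the intended call pattern presumably looks like the sketch below. Here `builder` stands in for an instance of the class above, and `stop()` is assumed as the teardown call, since that is what pyspark's SparkContext exposes:

# Hypothetical caller of sql_context(); the application name is a placeholder.
sc, sqlContext = builder.sql_context('my-application')
try:
    sqlContext.range(10).show()
finally:
    sc.stop()  # release cluster resources when done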
def __init__(self, spark_context, initial_data_path, sep='\\s+',
             row_limit=None, skip_rows=None):
    if initial_data_path is not None:
        self.sql_context = pyspark.SQLContext(spark_context)
        self.initial_data = self.sql_context.createDataFrame(
            pd.read_csv(
                initial_data_path,
                sep=sep,
                nrows=row_limit,
                skiprows=skip_rows
            ),
            ['f', 't']
        )
        self.old_data = self.sql_context.createDataFrame(
            pd.DataFrame([[0, 1]], columns=['f', 't']))
        self.data = self.initial_data
        self.final_output = None
    else:
        raise ValueError("File path shouldn't be None.")
def sqlite2parquet(db_path, output_dir, skip_tables=['sqlite_sequence']):
    if args.spark:
        # load Spark configuration
        conf = pyspark.SparkConf()
        conf.set('spark.executor.memory', '4g')
        # conf.set('spark.sql.parquet.compression.codec', 'gzip')
        # We recommend snappy because it is splittable
        # https://www.cloudera.com/documentation/enterprise/5-3-x/topics/admin_data_compression_performance.html
        conf.set('spark.sql.parquet.compression.codec', 'snappy')
        sc = pyspark.SparkContext("local", conf=conf)

    conn = sqlite3.connect(db_path)
    tables = get_table_list(conn)
    for table in tables:
        if table in skip_tables:
            print("Skipping: {0}".format(table))
            continue
        print("Converting: {0}".format(table))
        logging.info("Converting: {0}".format(table))
        gen = get_generator_from_table(conn, table)
        schema = get_column_names_from_table(conn, table)
        print("schema: ", schema)
        if args.spark:
            print("converting to data-frame")
            print("column names: {}".format(schema))
            a = sc.parallelize(gen)
            a.persist(StorageLevel.DISK_ONLY)
            sqlContext = pyspark.SQLContext(sc)
            df = sqlContext.createDataFrame(a, schema=schema, samplingRatio=None)
            fname = os.path.join(output_dir, table + '.parquet')
            print("\t saving...")
            # saveAsParquetFile() was removed from the DataFrame API;
            # write.parquet() is the replacement.
            df.write.parquet(fname)
        else:
            print("Running --no-spark")
            print("Here are the first five rows:")
            i = 0
            for row in gen:
                print("#{}: ".format(i), end=" ")
                for x in row:
                    print(x, end=" ")
                print()
                i += 1
                if i > 5:
                    break
def create_spark_context(aws_conn_id):
    """Creates the Spark session and returns an SQLContext bound to it."""
    spark = (SparkSession.builder.config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:2.7.0").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")
    # Use a distinct name; the original rebound 'sc' from a SparkContext to an SQLContext.
    sql_context = pyspark.SQLContext(spark.sparkContext)
    aws_hook = AwsHook(aws_conn_id)
    credentials = aws_hook.get_credentials()
    spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key",
                                                      credentials.access_key)
    spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key",
                                                      credentials.secret_key)
    return sql_context
def main():
    # Path to Parquet files
    parquet_dir = './data'

    # Create test data
    table_list = ['table1', 'table2']
    column_list = [['a', 'b', 'c'], ['d', 'e', 'f']]
    for table, columns in zip(table_list, column_list):
        save_table(
            pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 3)),
                         columns=columns),
            os.path.join(parquet_dir, 'db.sqlite'),
            table,
            lambda name: create_engine('sqlite:///' + name))

    # Set up Spark context
    conf = pyspark.SparkConf()
    settings = (
        ('spark.executor.memory', '4g'),
        ('spark.sql.parquet.compression.codec', 'gzip'),
        # ('spark.sql.parquet.compression.codec', 'snappy')
    )
    # A bare map() is lazy in Python 3 and would never apply the settings,
    # so iterate explicitly.
    for key, value in settings:
        conf.set(key, value)
    spark_context = pyspark.SparkContext("local", conf=conf)
    sql_context = pyspark.SQLContext(spark_context)

    # Save Parquet file
    save_to_parquet(spark_context, sql_context,
                    create_sqlite(os.path.join(parquet_dir, 'db.sqlite')),
                    table_list, parquet_dir)

    # Read Parquet file
    table_df = sql_context.read.parquet(os.path.join(parquet_dir, 'table1'))
    print(table_df.count())
    print(table_df.head())
def apply_model(sc, path_eval, output_path="../output/evaluation_rating.csv"):
    """
    Apply the model previously built to the evaluation file.
    :param sc: Spark context
    :param path_eval: csv file path
    :return: None, saves the file in an output folder
    """
    features_rating = StructType([
        StructField("userId", FloatType(), True),
        StructField("movieId", FloatType(), True),
    ])
    model_als = ALSModel.load("../als_model")
    df = pyspark.SQLContext(sc).read.format("csv").schema(
        features_rating).option("header", True).load(path_eval)
    predictions = model_als.transform(df)
    predictions.coalesce(1)\
        .write.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .save(output_path)
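A sketch of how apply_model might be driven end to end; the app name and evaluation path are placeholders, and ../als_model must already contain a saved ALS model:

# Hypothetical driver for apply_model(); the CSV path is a placeholder.
sc = pyspark.SparkContext(appName='als_eval')
apply_model(sc, '../data/evaluation.csv')
sc.stop()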
def run(string_content):
    try:
        content, user_input = parse_string_to_numeric(string_content)
        sc = pyspark.SparkContext.getOrCreate()
        sqlContext = pyspark.SQLContext(sc)
        edge_pairs = sc.textFile(LOCAL_DATA_PATH)
        Dk = getUVDFfromUndirectedEdgePairsRDD(sqlContext, edge_pairs,
                                               base_coin_functions)
        Dd = getUVSecondCircleDFfromUndirectedEdgePairsRDD(
            sqlContext, edge_pairs, base_coin_functions)
        res = get_plausible_filtered(sqlContext, Dk, Dd, base_coin_functions)
        writeDBResource(res)
        print("-----------------------------saved into people--------------------------------")
        res = res.filter((col(A_NODE) == content) |
                         (col(B_NODE) == content)).sort(desc(WEIGHT)).take(4)
        results_map = get_results_map(content, user_input, res)
        return results_map
    except Exception as e:
        print("error ------------->\n", e)
        return str(e)
# In[3]:

# Define the sqlContext
sqlContext = SQLContext(sc)
# Define the hive context
hiveContext = HiveContext(sc)

# Create the spark session.
ss = pyspark.sql.SparkSession(sc)
spark = ss.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

# Create sqlCtx object.
# CSVs are accessed as SQL tables using this.
sqlCtx = pyspark.SQLContext(sc)

# ## API: getTime and plotResults
#
# - plotResults is the utility function that will plot the compressed and reconstructed data.

# In[4]:

def getTime(x, dfTest):
    return dfTest.at[int(x), 'timeseries']

def plotResults(dfs, plotTemplates):
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.set_title('compression analysis')
import pyspark as spark
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf, monotonically_increasing_id, unix_timestamp, round, avg
import re
from pyspark.sql import SparkSession

sc = spark.SparkContext()
sql = spark.SQLContext(sc)
sc.setLogLevel("OFF")

#sparkSession = SparkSession.builder.appName("BaseServer").getOrCreate()
#TweetSDF = sparkSession.read.csv('hdfs://localhost/user/team14/tweets.csv')
#print('===========================================================================')
#print('Tweets counts : ', TweetSDF.count())

TweetPD = pd.read_csv('./meta/tweets.csv',
                      error_bad_lines=False,
                      engine='python',
                      header=None)
TweetSDF = sql.createDataFrame(TweetPD)
TweetSDF = TweetSDF.dropna()
print('===========================================================================')
print('Tweets counts : ', TweetSDF.count())
#sparkSession = SparkSession.builder.appName("BaseServer").getOrCreate()
def to_spark_df(self):
    sc = get_spark_context()
    if sc:
        import pyspark
        return pyspark.SQLContext(sc).createDataFrame(self.to_df())
# coding: utf-8
from pyspark import SparkContext as sc
from pyspark.sql.functions import concat_ws
import pyspark
import sys
import requests
import json

# Python 3 strings are Unicode by default, so the old
# reload(sys); sys.setdefaultencoding('UTF8') hack is unnecessary and removed.

r = requests.get('http://0.0.0.0:5000/jsonresult')
with open('/root/rezerv/output.json', 'w') as outfile:
    json.dump(r.json(), outfile)

sqlContext = pyspark.SQLContext(pyspark.SparkContext())
df = sqlContext.read.json('file:///root/rezerv/output.json')
df.select('geometry.location.lat', 'geometry.location.lng', 'name',
          concat_ws(',', 'types'), 'rating',
          'vicinity').write.csv('/datastorage/poidata.csv')
def sqlcontext():
    sc = pyspark.SparkContext.getOrCreate()
    return (pyspark.SQLContext(sc), sc)
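Callers unpack the pair this helper returns; a minimal sketch with invented toy rows (note the SQLContext comes first):

# Hypothetical usage of sqlcontext(); getOrCreate() makes repeated calls safe.
sqlContext, sc = sqlcontext()
df = sqlContext.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
df.show()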
# In[ ]:

sc = pyspark.SparkContext()

# In[ ]:

sc

# In[ ]:

sql_sc = pyspark.SQLContext(sc)

Next, we change the column names to better understand what we are dealing with. We only rename the unique columns; we will do the others later.

# In[ ]:

def convert_prices_to_ddf(file):
    res_ddf = (sql_sc
               .read
               .option('header', 'false')
               .option('sep', ';')
               .option('inferSchema', 'true')
               .csv(file)
               .selectExpr(
                   '_c0 as station_id',
                   '_c1 as post_code',
def test_linear_regression():
    """
    Test the linear regression model on the cleaned data.
    :return: None
    """
    conf = (pyspark.SparkConf().setAppName('test').set(
        "spark.executor.memory", "2g").setMaster("local[2]"))
    sc = pyspark.SparkContext(conf=conf)
    path_rating = "../data/rating_with_movie_data.csv"
    df = pyspark.SQLContext(sc).read.format("csv").option(
        "header", True).load(path_rating)
    columns_to_drop = [
        'timestamp', 'imdbId', 'tmdbId', 'imdb_id', 'release_date'
    ]
    df = df.drop(*columns_to_drop)
    for col_name in df.columns:
        df = df.withColumn(col_name, col(col_name).cast(FloatType()))
    df = df.fillna(0)
    assembler = VectorAssembler(
        inputCols=[x for x in df.columns if x not in ['rating']],
        outputCol="features")
    pipeline = Pipeline(stages=[assembler])
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)
    selected_cols = ['features', 'rating']
    df = df.select(selected_cols)
    (trainingData, testData) = df.randomSplit([0.8, 0.2])
    lr = LinearRegression(featuresCol='features', labelCol='rating',
                          maxIter=10)

    # Chain the regression in a Pipeline
    pipeline = Pipeline(stages=[lr])
    evaluator = RegressionEvaluator(labelCol="rating",
                                    predictionCol="prediction",
                                    metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    # Reuse the evaluator configured above; a default RegressionEvaluator
    # would look for a 'label' column that this DataFrame does not have.
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=2)

    # Train on the training split only, so the reported test error is honest.
    model = crossval.fit(trainingData)

    # Make predictions on the held-out test split.
    predictions = model.transform(testData)
    predictions.show()

    # Select example rows to display.
    predictions.select("prediction", "rating", "features").show(5)

    # Select (prediction, true label) and compute test error
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
def nlp_pipeline(filename, max_length, dict_length, num_executors, dict_file,
                 test_split, num_classes, sc):
    sql = pyspark.SQLContext(sc)

    # Load csv file with raw samples
    print('{} Loading input data'.format(datetime.now()))
    if num_classes == 2:
        csv_data = sql.read.csv(filename, sep="\t", inferSchema=True, header=True) \
            .where('star_rating in (1, 2, 4, 5)')
    else:
        csv_data = sql.read.csv(filename, sep="\t", inferSchema=True, header=True) \
            .where('star_rating in (1, 2, 3, 4, 5)')

    if test_split is None:
        start_data = csv_data
        test_raw_data = None
    else:
        # Take apart test data from the beginning (if needed)
        (start_data, test_raw_data) = csv_data.randomSplit(
            [1 - test_split, test_split], seed=SEED)
    raw_data = start_data.rdd.cache()

    # Create a new RDD with a tuple of text and label
    data = raw_data.map(lambda sample: (sample.review_body, sample.star_rating))

    # Clean data by removing markup, urls, emails, etc.
    print('{} Cleaning text'.format(datetime.now()))
    clean_rdd = data.map(lambda sample: clean_data(sample))

    # Split sentences into tokens
    print('{} Splitting text in tokens'.format(datetime.now()))
    words_rdd = clean_rdd.map(lambda sample: tokenizer(sample))

    # Remove stop words and punctuation from sentences
    print('{} Removing stop words and punctuation'.format(datetime.now()))
    clean_words_rdd = words_rdd.map(
        lambda sample: remove_stop_words_and_punc(sample))

    # Get the lemma of each word
    print('{} Lemmatization'.format(datetime.now()))
    lemma_words_rdd = clean_words_rdd.map(lambda sample: lemmatize(sample)) \
        .cache()

    if dict_file is None:
        # Create a dictionary of distinct words and index
        print('{} Creating dictionary'.format(datetime.now()))
        dictionary = create_dictionary(lemma_words_rdd, dict_length)
    else:
        print('{} Loading dictionary'.format(datetime.now()))
        dictionary = load_dictionary(dict_file)

    # Broadcast the dictionary in order to have it in all workers
    dict_broad = sc.broadcast(dictionary)

    # Substitute words with their index
    print('{} Substituting words with indexes'.format(datetime.now()))
    index_rdd = lemma_words_rdd.map(
        lambda sample: replace_word(sample, dict_broad))

    # Remove empty samples
    print('{} Removing empty samples'.format(datetime.now()))
    filtered_rdd = index_rdd.filter(
        lambda sample: np.sum(np.asarray(sample[0], dtype=np.float32)) > 0)

    # Normalize input indexes. Commented out because it degraded results.
    #print('{} Normalizing data'.format(datetime.now()))
    #norm_rdd = filtered_rdd.map(lambda sample: normalize(sample, dict_length))
    norm_rdd = filtered_rdd

    # Pad or trim samples so they all have the same length.
    print('{} Padding arrays'.format(datetime.now()))
    padded_rdd = norm_rdd.map(
        lambda sample: trim_or_pad_samples(sample, max_length))

    # One hot encoding of label
    print('{} One hot encoding labels'.format(datetime.now()))
    if num_classes == 2:
        final_rdd = padded_rdd.map(
            lambda sample: sample[0] + label_OHE_bin[sample[1]])
    else:
        final_rdd = padded_rdd.map(
            lambda sample: sample[0] + label_OHE[sample[1]])

    # Convert the RDD to a DataFrame, filling null values with 0
    final_df = final_rdd.toDF() \
        .na.fill(0)

    return final_df, dictionary, test_raw_data
@classmethod
def setUpClass(cls):
    conf = pyspark.SparkConf().setMaster("local[2]").setAppName("testing")
    cls.sc = pyspark.SparkContext(conf=conf)
    cls.spark = pyspark.SQLContext(cls.sc)
        # predict new rating
        if item_rate_list[i][0] in neigh_dict.keys():
            sim = neigh_dict[item_rate_list[i][0]]
            numer += (sim * item_rate_list[i][1])
            denom += sim
    if denom > 0:
        pred_rating = numer / denom
    return user, pred_rating

neigh_dict = {}
sc = pyspark.SparkContext.getOrCreate()
given_items = eval(sys.argv[2])
ip = pyspark.SQLContext(sc).read.option("header", "true").json(sys.argv[1])
ip_rdd = ip.select("overall", "reviewerID", "asin").rdd.map(list)
'''
Filtering - only one rating per user per item.
The data is by default in the descending order of review time.
Grouping by (item, user) and taking the first rating from the list of
ratings, as that would be the most recent.
'''
filter_one = ip_rdd.map(lambda x: ((x[2], x[1]), x[0])).combineByKey(
    lambda x: [x], lambda u, v: u + [v], lambda x, y: x + y)
filtered_data = filter_one.map(lambda x: (x[0][0], x[0][1], list(x[1])[0]))
items_users_group = filtered_data.map(
    lambda x: (x[0], (x[1], x[2]))).combineByKey(
        lambda x: [x], lambda u, v: u + [v], lambda x, y: x + y).map(
            lambda x: (x[0], list(x[1]))).filter(lambda x: len(x[1]) >= 25)
filter_items = set(items_users_group.map(lambda x: x[0]).collect())
# The big dataset might yield more than 1000 items or users, which is why the set is not broadcast.
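The combineByKey deduplication described in the docstring above can be checked on toy data. A minimal sketch, reusing the sc created above; the ratings are invented, and a single partition is forced so input order is preserved, which the full job implicitly relies on:

# Rows are (rating, user, item), assumed ordered newest-first.
toy = sc.parallelize([(5.0, 'u1', 'i1'), (3.0, 'u1', 'i1'), (4.0, 'u2', 'i1')], 1)
# Same (item, user) -> [ratings] grouping as filter_one above.
grouped = toy.map(lambda x: ((x[2], x[1]), x[0])).combineByKey(
    lambda x: [x], lambda u, v: u + [v], lambda x, y: x + y)
# Keeping the first collected rating per key keeps the most recent one.
print(grouped.map(lambda x: (x[0][0], x[0][1], x[1][0])).collect())
# e.g. [('i1', 'u1', 5.0), ('i1', 'u2', 4.0)]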
import multiprocessing

import pyspark as ps

from lsa import LatentSemanticAnalysis
from reviews import get_beer_reviews_dataframe

if __name__ == '__main__':
    cpu = 'local[{}]'.format(multiprocessing.cpu_count())
    sc = ps.SparkContext(cpu)
    print("Just created a SparkContext")
    sql_context = ps.SQLContext(sc)
    print("Just created a SQLContext")

    df = sql_context.read.json('../data/reviews.json')
    print("df")
    df_reviews = df.select('brewery_name', 'beer_name', 'state', 'beer_style',
                           'avg_rating', 'text')
    print("df_reviews")

    try:
        df_beer_reviews = get_beer_reviews_dataframe(df_reviews)
    except Exception:
        pass
    df_beer_reviews = get_beer_reviews_dataframe(df_reviews)
    df_beer_reviews.persist(ps.StorageLevel.MEMORY_AND_DISK)
    print("df_beer_reviews")
    print(df_beer_reviews.count())

    lsa_ = LatentSemanticAnalysis(sc, sql_context, df_beer_reviews)
    lsa_.transform(n_components=500)

    # sc.stop()
path = "test" cs.entries.toDF().write.parquet(path) cs.entries.toDF().coalesce(1)\ .write.format("com.databricks.spark.csv")\ .option("header", "true")\ .save("testtest.csv") if __name__ == "__main__": path_rating = "../data/ratings.csv" conf = (pyspark.SparkConf().setAppName('test').set( "spark.executor.memory", "2g").setMaster("local[4]")).set('spark.sql.pivotMaxValues', u'50000') sc = pyspark.SparkContext(conf=conf) features_rating = StructType([ StructField("userId", FloatType(), True), StructField("movieId", FloatType(), True), StructField("rating", FloatType(), True), StructField("timestamp", StringType(), True) ]) df = pyspark.SQLContext(sc).read.format("csv").schema( features_rating).option("header", True).load(path_rating) df = df.select("userId", "movieId", "rating") compute_similarity(df)
### --- dir structure and params
hdfsPath = params['hdfsPath']
dataDir = params['dataDir']

# - Spark Session (named 'spark' rather than 'sc', since it is a session, not a context)
spark = SparkSession\
    .builder\
    .appName("WD Inequality")\
    .enableHiveSupport()\
    .getOrCreate()

# - Spark Session Log Level: INFO
spark.sparkContext.setLogLevel("INFO")

# - SQL context
sqlContext = pyspark.SQLContext(spark.sparkContext)

### --- get wmf.mediawiki_history snapshot
snaps = sqlContext.sql('SHOW PARTITIONS wmf.mediawiki_history')
snaps = snaps.toPandas()
mwwikiSnapshot = snaps.tail(1)['partition'].to_string()
mwwikiSnapshot = mwwikiSnapshot[-7:]
currentMonth = mwwikiSnapshot
currentYear = mwwikiSnapshot[0:4]

### --- Edits distribution: since the beginning of time and until current snapshot
wdri = sqlContext.sql(
    'SELECT event_user_id, event_user_is_bot_by FROM wmf.mediawiki_history '
    'WHERE event_entity="revision" AND event_type="create" '
    'AND wiki_db="wikidatawiki" AND page_namespace=0 '
    'AND snapshot="' + mwwikiSnapshot + '"')
wdri = wdri.withColumn("bot_name",
                       array_contains(col("event_user_is_bot_by"), "name"))
import numpy as np
import pandas as pd
import pyspark
import logging

# The original format string referenced clientip/user fields that were never
# supplied, which would break every log call; use only fields logging provides.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
# getLogger() registers the logger so basicConfig's handler applies;
# instantiating logging.Logger directly bypasses it.
logger = logging.getLogger('Main')

try:
    sc = pyspark.SparkContext(appName="Test")
except Exception as err:
    logger.warning(err)

sql = pyspark.SQLContext(sc)
rdd = sc.parallelize([
    ('A', 1),
    ('B', 5),
    ('A', 6),
    ('B', 9),
    ('A', 2)
])

def func(x):
    key = x[0]
    value = x[1]
    print(value)

arr1 = [
    ["Movies", np.array([1.0, 2.5], dtype=np.float32)],