Example #1
    def initialize_params(self, partitions=2100, cores=5, memory=11):
        conf = SparkConf()
        conf.set('spark.sql.shuffle.partitions', str(partitions))
        conf.set("spark.executor.cores", str(cores))
        SparkContext.setSystemProperty('spark.executor.memory',
                                       str(memory) + 'g')
        SparkContext.setSystemProperty('spark.driver.memory',
                                       str(memory) + 'g')
        self.sc = SparkContext(appName='mm_exp', conf=conf)
        self.sqlContext = pyspark.SQLContext(self.sc)
# Imports assumed by this snippet (not present in the original fragment)
import pandas
import matplotlib.pyplot as plt
import pyspark as ps
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, StopWordsRemover, Tokenizer


def start_spark_sentiment_analysis(hashtag):
    sc = ps.SparkContext('local[*]')
    sqlContext = ps.SQLContext(sc)

    tokenizer = Tokenizer(inputCol="Tweets", outputCol="words")
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    hashingTF = HashingTF(numFeatures=10000,
                          inputCol="base_words",
                          outputCol="features")
    lr = LogisticRegression(featuresCol="features",
                            labelCol="Sentiment",
                            elasticNetParam=0.8,
                            regParam=0.001,
                            family="multinomial")

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

    partsDF = pandas.read_csv("./usr/local/static/tweet_sentiment.csv")
    datasize = partsDF.shape[0]
    trainSet = partsDF.sample(frac=0.5, replace=False)

    trainSet = sqlContext.createDataFrame(trainSet)
    partsDF = sqlContext.createDataFrame(partsDF)
    lrModel = pipeline.fit(trainSet)

    lrResult = lrModel.transform(partsDF)

    accuracy = round(
        lrResult.where('Sentiment == prediction').count() / datasize * 100, 2)
    neutral = lrResult.where('prediction == 0').count()
    supportive = lrResult.where('prediction == 1').count()
    against = lrResult.where('prediction == 2').count()

    print("\n\n\n\n\n|----------------------##----------------------|")
    print("Accuracy=\t", accuracy, "%")
    print("Neutral=\t", neutral)
    print("Supportive=\t", supportive)
    print("Against=\t", against)

    frequencies = [supportive, neutral, against]

    # pandas.Series.from_array was removed from pandas; construct the Series directly.
    freq_series = pandas.Series(frequencies)

    x_labels = ['Positive Tweets', 'Neutral Tweets', 'Negative Tweets']
    title = 'Sentimental Analysis on Twitter Data ' + hashtag
    # Plot the figure.
    plt.figure(figsize=(14, 10))
    ax = freq_series.plot(kind='bar', color="green")
    ax.set_title(title, fontsize=24, weight='bold')
    ax.set_xlabel('Sentiment', fontsize=18, weight='bold')
    ax.set_ylabel('Frequency', fontsize=18, weight='bold')
    ax.set_xticklabels(x_labels, fontsize=18, weight='bold', rotation=0)

    plt.savefig("./usr/local/static/result.png")
Example #3
def pyspark_setting():
    """
    Setup pyspark
    :return:
    """
    conf = pyspark.SparkConf()
    sc = pyspark.SparkContext()
    aws_id = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    s3 = boto3.resource('s3')
    sqlContext = pyspark.SQLContext(sc)
Example #4
def get_spark_session(app_name=SPARK_APP_NAME):
    # configure
    conf = pyspark.SparkConf()
    conf.set('spark.app.name', app_name)
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.debug.maxToStringFields', 1000)

    # init & return
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sc.setLogLevel('WARN')
    return pyspark.SQLContext(sparkContext=sc)
Example #5
    def __init__(self,
                 spark_master=None,
                 app_name=None,
                 spark_cores=2,
                 spark_memory="1g",
                 ceph_access_key=None,
                 ceph_secret_key=None,
                 ceph_host_url=None):

        if not spark_master:
            if os.getenv('SPARK_LOCAL') == "True":
                spark_master = 'local[2]'
                spark_cores = 2
                spark_memory = "1g"
                print("Using local spark")
            else:
                spark_master = "spark://" + os.getenv(
                    'OSHINKO_CLUSTER_NAME') + ":7077"
        if not app_name:
            inst = ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=4))
            app_name = inst + ' - Ephemeral Spark Application'
        #Set the configuration
        print("Application Name: ", app_name)

        self.spark_settings = {
            'spark_master': spark_master,
            'app_name': app_name,
            'spark_cores': spark_cores,
            'spark_memory': spark_memory
        }

        conf = pyspark.SparkConf().setAppName(
            self.spark_settings['app_name']).setMaster(spark_master)

        conf.set("spark.cores.max", str(self.spark_settings['spark_cores']))
        conf.set("spark.executor.memory", self.spark_settings['spark_memory'])

        #Set the Spark cluster connection
        self.sc = pyspark.SparkContext.getOrCreate(conf)

        #Set the Hadoop configurations to access Ceph S3
        self.sc._jsc.hadoopConfiguration().set(
            "fs.s3a.access.key", os.getenv('DH_CEPH_KEY', ceph_access_key))
        self.sc._jsc.hadoopConfiguration().set(
            "fs.s3a.secret.key", os.getenv('DH_CEPH_SECRET', ceph_secret_key))
        self.sc._jsc.hadoopConfiguration().set(
            "fs.s3a.endpoint", os.getenv('DH_CEPH_HOST', ceph_host_url))

        #Get the SQL context
        self.sqlContext = pyspark.SQLContext(self.sc)
Example #6
def main():
    """ Program entrypoint, orchestrates the pipeline"""
    args = parse_args()
    tfidf = create_tf_idf(args.input)
    information_extractor = InformationExtractor(args.vectors, read_gazetter(args.brands), read_gazetter(args.styles),
                                                 read_gazetter(args.materials), read_gazetter(args.items),
                                                 read_gazetter(args.probasebrands),
                                                 read_gazetter(args.probasematerials), read_gazetter(args.patterns),
                                                 read_gazetter(args.itemtopcategory), args.deepdetect, args.conf, tfidf)
    sc = pyspark.SparkContext(conf=sparkConf())
    sql = pyspark.SQLContext(sc)
    analyze_user(information_extractor, sql, args)
Example #7
    def __connected_spark_cluster(self, resource_url, pilot_description=None):
        conf = pyspark.SparkConf()
        conf.setAppName("Pilot-Spark")
        if pilot_description is not None:
            for i in list(pilot_description.keys()):
                if i.startswith("spark"):
                    conf.set(i, pilot_description[i])
        conf.setMaster(resource_url)
        print(conf.toDebugString())
        sc = pyspark.SparkContext(conf=conf)
        sqlCtx = pyspark.SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
Example #8
def _test():
    """
    Unit test function for PairwiseEuclideanDistance class.
    """
    try:
        if isinstance(sc, pyspark.SparkContext):
            print("SparkContext:", sc)
    except Exception as ex:
        sc = pyspark.SparkContext()
        print("Created SparkContext:", sc)
    try:
        if isinstance(sqlContext, pyspark.SQLContext):
            print("SQLContext:", sqlContext)
    except Exception as ex:
        sqlContext = pyspark.SQLContext(sc)
        print("Created SQLContext:", sqlContext)

    # Generate dummy vectors
    rdd_data = [(pyspark.mllib.linalg.Vectors.dense([0.0, 1.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([1.0, 1.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([2.0, 3.0]), ),
                (pyspark.mllib.linalg.Vectors.dense([8.0, 9.0]), )]
    df_data = sqlContext.createDataFrame(rdd_data, ["feature"])
    df_data.show()
    df_data.collect()

    # Instantiate a pairwise Euclidean distance transformer
    ped = PairwiseEuclideanDistance(squared=False,
                                    inputCol="feature",
                                    outputCol="distance")

    # Get squared Euclidean pairwise distances
    ped.setSquared(True)
    ped.getSquared()
    ped.getInputCol()
    ped.hasDefault("squared")
    print(ped.explainParams())
    ped.transform(df_data).show()

    # Get Euclidean pairwise distances
    ped.setSquared(False)
    print(ped.explainParams())
    ped.transform(df_data).show()

    # Shutdown SparkContext
    sc.stop()
    print("SparkContext is shutdown.")
Example #9
    def __init__(self, topic, spark_ip="local[2]", kafka_ip="localhost:9092"):
        self.topic = topic
        self.kafka_ip = kafka_ip
        sc = SparkContext(spark_ip, appName="WikiStream")
        self.spark = pyspark.SQLContext(sc)
        self.df = self.spark \
            .read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", self.kafka_ip) \
            .option("subscribe", self.topic) \
            .option("startingOffsets", "earliest") \
            .load()

        self.events = self.df.withColumn("value", F.col("value").cast(StringType())) \
            .withColumn("value", F.from_json("value", MAIN_SCHEMA)) \
            .select("value.data.*") \
            .withColumn("time", F.col("meta.dt").cast("timestamp"))
Example #10
File: launcher.py Project: rgbkrk/spylon
    def sql_context(self, application_name):
        """Create a spark context given the parameters configured in this class.

        The caller is responsible for calling ``.close`` on the resulting spark context

        Parameters
        ----------
        application_name : string

        Returns
        -------
        sc : SparkContext
        """
        sc = self.spark_context(application_name)
        import pyspark
        sqlContext = pyspark.SQLContext(sc)
        return (sc, sqlContext)
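A minimal usage sketch for the method above, assuming a configured spylon launcher instance named launcher (the name is illustrative, not from the source); per the docstring, the caller owns the resulting context and must shut it down:

# Hypothetical usage; `launcher` stands in for a configured spylon launcher object.
sc, sqlContext = launcher.sql_context("example-app")
try:
    df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    df.show()
finally:
    sc.stop()  # the caller is responsible for tearing the context down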
    def __init__(self, spark_context, initial_data_path, sep='\\s+', row_limit=None, skip_rows=None):
        if initial_data_path is not None:
            self.sql_context = pyspark.SQLContext(spark_context)
            self.initial_data = self.sql_context.createDataFrame(
                pd.read_csv(
                    initial_data_path,
                    sep=sep,
                    nrows=row_limit,
                    skiprows=skip_rows
                ),
                ['f', 't']
            )
            self.old_data = self.sql_context.createDataFrame(pd.DataFrame([[0, 1]], columns=['f', 't']))
            self.data = self.initial_data
            self.final_output = None
        else:
            raise ValueError("File path shouldn't be None.")
def sqlite2parquet(db_path, output_dir, skip_tables=['sqlite_sequence']):
    if args.spark:

        # load Spark configuration
        conf = pyspark.SparkConf()
        conf.set('spark.executor.memory', '4g')
        #conf.set('spark.sql.parquet.compression.codec', 'gzip')
        # We recommend snappy because it is splittable
        # https://www.cloudera.com/documentation/enterprise/5-3-x/topics/admin_data_compression_performance.html
        conf.set('spark.sql.parquet.compression.codec', 'snappy')
        sc = pyspark.SparkContext("local", conf=conf)

    conn = sqlite3.connect(db_path)
    tables = get_table_list(conn)
    for table in tables:
        if table in skip_tables:
            print("Skipping: {0}".format(table))
            continue
        print("Converting: {0}".format(table))
        logging.info("Converting: {0}".format(table))
        gen    = get_generator_from_table(conn, table)
        schema = get_column_names_from_table(conn, table)
        print("schema: ",schema)

        if args.spark:
            print("converting to data-frame")
            print("column names: {}".format(schema))

            a = sc.parallelize(gen)
            a.persist(StorageLevel.DISK_ONLY)
            sqlContext = pyspark.SQLContext(sc)
            df = sqlContext.createDataFrame(a, schema=schema, samplingRatio=None)
            fname = os.path.join(output_dir, table + '.parquet')
            print("\t saving...")
            df.write.parquet(fname)  # saveAsParquetFile was removed in Spark 2.x
        else:
            print("Running --no-spark")
            print("Here are the first five rows:")
            i = 0
            for row in gen:
                print("#{}: ".format(i),end=" ")
                for x in row:
                    print(x,end=" ")
                print()
                i += 1
                if i>5: break
Example #13
def create_spark_context(aws_conn_id):
    """Creates the Spark session and returns a SQLContext configured with AWS credentials."""
    spark = (SparkSession.builder.config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:2.7.0").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")

    sql_context = pyspark.SQLContext(spark.sparkContext)

    aws_hook = AwsHook(aws_conn_id)
    credentials = aws_hook.get_credentials()
    spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key",
                                                      credentials.access_key)
    spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key",
                                                      credentials.secret_key)

    return sql_context
Example #14
def main():

    # Path to Parquet files
    parquet_dir = './data'

    # Create test data
    table_list = ['table1', 'table2']
    column_list = [['a', 'b', 'c'], ['d', 'e', 'f']]

    for table, columns in zip(table_list, column_list):
        save_table(
            pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 3)),
                         columns=columns),
            os.path.join(parquet_dir, 'db.sqlite'), table,
            lambda name: create_engine('sqlite:///' + name))

    # Set up Spark context
    conf = pyspark.SparkConf()

    settings = (
        ('spark.executor.memory', '4g'),
        ('spark.sql.parquet.compression.codec', 'gzip'),
        # ('spark.sql.parquet.compression.codec', 'snappy')
    )

    # In Python 3, map() is lazy and the settings would never be applied; use an explicit loop.
    for key, value in settings:
        conf.set(key, value)

    spark_context = pyspark.SparkContext("local", conf=conf)
    sql_context = pyspark.SQLContext(spark_context)

    # Save Parquet file
    save_to_parquet(spark_context, sql_context,
                    create_sqlite(os.path.join(parquet_dir, 'db.sqlite')),
                    table_list, parquet_dir)

    # Read Parquet file
    table_df = sql_context.read.parquet(os.path.join(parquet_dir, 'table1'))
    print(table_df.count())
    print(table_df.head())
Example #15
def apply_model(sc, path_eval, output_path="../output/evaluation_rating.csv"):
    """
    Apply the model previously built to the evaluation file
    :param path_eval: csv file path
    :return: None, save the file in an output folder
    """
    features_rating = StructType([
        StructField("userId", FloatType(), True),
        StructField("movieId", FloatType(), True),
    ])

    model_als = ALSModel.load("../als_model")

    df = pyspark.SQLContext(sc).read.format("csv").schema(
        features_rating).option("header", True).load(path_eval)

    predictions = model_als.transform(df)

    predictions.coalesce(1)\
       .write.format("com.databricks.spark.csv")\
       .option("header", "true")\
       .save(output_path)
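A minimal invocation sketch for apply_model, assuming a local SparkContext; the evaluation CSV path below is illustrative, and the default output path comes from the signature above:

# Hypothetical call; "../data/evaluation.csv" is a placeholder path.
sc = pyspark.SparkContext.getOrCreate()
apply_model(sc, "../data/evaluation.csv")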
Example #16
def run(string_content):
    try:
        content, user_input = parse_string_to_numeric(string_content)
        sc = pyspark.SparkContext.getOrCreate()
        sqlContext = pyspark.SQLContext(sc)
        edge_pairs = sc.textFile(LOCAL_DATA_PATH)
        Dk = getUVDFfromUndirectedEdgePairsRDD(sqlContext, edge_pairs,
                                               base_coin_functions)
        Dd = getUVSecondCircleDFfromUndirectedEdgePairsRDD(
            sqlContext, edge_pairs, base_coin_functions)
        res = get_plausible_filtered(sqlContext, Dk, Dd, base_coin_functions)
        writeDBResource(res)
        print(
            "-----------------------------saved into people--------------------------------"
        )

        res = res.filter((col(A_NODE) == content)
                         | (col(B_NODE) == content)).sort(desc(WEIGHT)).take(4)
        results_map = get_results_map(content, user_input, res)
        return results_map
    except Exception as e:
        print("error -------------> \n", e)
        return e.__str__()
Example #17
# In[3]:

# Define the sqlContext
sqlContext = SQLContext(sc)
# Define the hive context
hiveContext = HiveContext(sc)

# Create the spark session.
ss = pyspark.sql.SparkSession(sc)
spark = ss.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

# Create sqlCtx object.
# CSV are accessed as sql tables using this.
sqlCtx = pyspark.SQLContext(sc)

# ## API: getTime and plotResults
#
# - plotResults is the utility function that will plot the compressed and reconstructed data.

# In[4]:


def getTime(x, dfTest):
    return dfTest.at[int(x), 'timeseries']


def plotResults(dfs, plotTemplates):
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.set_title('compression analysis')
Example #18
import pyspark as spark
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf, monotonically_increasing_id, unix_timestamp, round, avg
import re

from pyspark.sql import SparkSession

sc = spark.SparkContext()
sql = spark.SQLContext(sc)

sc.setLogLevel("OFF")

#sparkSession = SparkSession.builder.appName("BaseServer").getOrCreate()
#TweetSDF = sparkSession.read.csv('hdfs://localhost/user/team14/tweets.csv')
#print('===========================================================================')
#print('Tweets counts : ', TweetSDF.count())

TweetPD = pd.read_csv('./meta/tweets.csv',
                      error_bad_lines=False,
                      engine='python',
                      header=None)
TweetSDF = sql.createDataFrame(TweetPD)
TweetSDF = TweetSDF.dropna()
print(
    '==========================================================================='
)
print('Tweets counts : ', TweetSDF.count())

#sparkSession = SparkSession.builder.appName("BaseServer").getOrCreate()
Example #19
File: _collection.py Project: jshiv/worm
    def to_spark_df(self):
        sc = get_spark_context()
        if sc:
            import pyspark
            return pyspark.SQLContext(sc).createDataFrame(self.to_df())
Example #20
# coding: utf-8
from pyspark import SparkContext as sc
from pyspark.sql.functions import concat_ws
import pyspark
import sys
import requests
import json
# sys.setdefaultencoding() does not exist, here!
reload(sys)  # Reload does the trick!
sys.setdefaultencoding('UTF8')

r = requests.get('http://0.0.0.0:5000/jsonresult')
with open('/root/rezerv/output.json', 'w') as outfile:
    json.dump(r.json(), outfile)

sqlContext = pyspark.SQLContext(pyspark.SparkContext())
df = sqlContext.read.json('file:///root/rezerv/output.json')
df.select('geometry.location.lat', 'geometry.location.lng', 'name', concat_ws(',', 'types'), 'rating', 'vicinity').write.csv('/datastorage/poidata.csv')
Example #21
def sqlcontext():
    sc = pyspark.SparkContext.getOrCreate()
    return (pyspark.SQLContext(sc), sc)
Example #22
# In[ ]:


sc = pyspark.SparkContext()


# In[ ]:


sc


# In[ ]:


sql_sc = pyspark.SQLContext(sc)

Next, we change the column names to better understand what we are dealing with. We rename only the unique columns here and will handle the rest later.
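A self-contained sketch of that renaming step on a toy DataFrame (the values and column names are illustrative; convert_prices_to_ddf below applies the same selectExpr aliasing to the CSV's positional columns):

# Toy example: rename positional CSV columns (_c0, _c1, ...) to meaningful names.
toy_ddf = sql_sc.createDataFrame([(101, '75001'), (102, '69002')], ['_c0', '_c1'])
renamed_ddf = toy_ddf.selectExpr('_c0 as station_id', '_c1 as post_code')
renamed_ddf.printSchema()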
# In[ ]:


def convert_prices_to_ddf(file):
    res_ddf = (sql_sc
               .read
               .option('header', 'false')
               .option('sep', ';')
               .option('inferSchema', 'true')
               .csv(file)
               .selectExpr(
                 '_c0 as station_id',
                 '_c1 as post_code',
def test_linear_regression():
    """
    Test the linear regression model on the cleaned data
    :param sc: Spark context
    :return: None
    """
    conf = (pyspark.SparkConf().setAppName('test').set(
        "spark.executor.memory", "2g").setMaster("local[2]"))
    sc = pyspark.SparkContext(conf=conf)
    path_rating = "../data/rating_with_movie_data.csv"

    df = pyspark.SQLContext(sc).read.format("csv").option(
        "header", True).load(path_rating)

    columns_to_drop = [
        'timestamp', 'imdbId', 'tmdbId', 'imdb_id', 'release_date'
    ]
    df = df.drop(*columns_to_drop)

    for col_name in df.columns:
        df = df.withColumn(col_name, col(col_name).cast(FloatType()))

    df = df.fillna(0)

    assembler = VectorAssembler(
        inputCols=([x for x in df.columns if x not in ['rating']]),
        outputCol="features")
    pipeline = Pipeline(stages=[assembler])
    pipelineModel = pipeline.fit(df)

    df = pipelineModel.transform(df)

    selected_cols = ['features', 'rating']
    df = df.select(selected_cols)

    (trainingData, testData) = df.randomSplit([0.8, 0.2])

    lr = LinearRegression(featuresCol='features',
                          labelCol='rating',
                          maxIter=10)

    # Wrap the linear regression in a Pipeline
    pipeline = Pipeline(stages=[lr])

    evaluator = RegressionEvaluator(labelCol="rating",
                                    predictionCol="prediction",
                                    metricName="rmse")

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=2)

    # Train the model with cross-validation on the training split.
    model = crossval.fit(trainingData)

    # Make predictions on the held-out test split.
    predictions = model.transform(testData)

    predictions.show()

    # Select example rows to display.
    predictions.select("prediction", "rating", "features").show(5)

    # Select (prediction, true label) and compute test error
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
Example #24
def nlp_pipeline(filename, max_length, dict_length, num_executors, dict_file,
                 test_split, num_classes, sc):
    sql = pyspark.SQLContext(sc)
    # Load csv file with raw samples
    print('{} Loading input data'.format(datetime.now()))
    if num_classes == 2:
        csv_data = sql.read.csv(filename, sep="\t", inferSchema=True, header=True) \
                                  .where('star_rating in (1, 2, 4, 5)')
    else:
        csv_data = sql.read.csv(filename, sep="\t", inferSchema=True, header=True) \
                                  .where('star_rating in (1, 2, 3, 4, 5)')

    if test_split is None:
        start_data = csv_data
        test_raw_data = None
    else:
        # Take apart test data from the beginning (if needed)
        (start_data,
         test_raw_data) = csv_data.randomSplit([1 - test_split, test_split],
                                               seed=SEED)

    raw_data = start_data.rdd.cache()

    # Create a new RDD with a tuple of text and label
    data = raw_data.map(lambda sample:
                        (sample.review_body, sample.star_rating))

    # Clean data by removing markup, URLs, emails, etc.
    print('{} Cleaning text'.format(datetime.now()))
    clean_rdd = data.map(lambda sample: clean_data(sample))

    # Split sentences in tokens
    print('{} Splitting text in tokens'.format(datetime.now()))
    words_rdd = clean_rdd.map(lambda sample: tokenizer(sample))

    # Remove stop words and punctuations from sentences
    print('{} Removing stop words and punctuation'.format(datetime.now()))
    clean_words_rdd = words_rdd.map(
        lambda sample: remove_stop_words_and_punc(sample))

    # Get the lemma of each word
    print('{} Lemmatization'.format(datetime.now()))
    lemma_words_rdd = clean_words_rdd.map(lambda sample: lemmatize(sample)) \
             .cache()

    if dict_file is None:
        # Create a dictionary of distinct words and index
        print('{} Creating dictionary'.format(datetime.now()))
        dictionary = create_dictionary(lemma_words_rdd, dict_length)
    else:
        print('{} Loading dictionary'.format(datetime.now()))
        dictionary = load_dictionary(dict_file)

    # Broadcast the dictionary in order to have it in all workers
    dict_broad = sc.broadcast(dictionary)

    # Substitute each word with its index
    print('{} Substituting words with indexes'.format(datetime.now()))
    index_rdd = lemma_words_rdd.map(
        lambda sample: replace_word(sample, dict_broad))

    # Remove empty samples
    print('{} Removing empty samples'.format(datetime.now()))
    filtered_rdd = index_rdd.filter(
        lambda sample: np.sum(np.asarray(sample[0], dtype=np.float32)) > 0)

    # Normalize input indexes. Commented out because it degrades the results.
    #print('{} Normalizing data'.format(datetime.now()))
    #norm_rdd = filtered_rdd.map(lambda sample: normalize(sample, dict_length))
    norm_rdd = filtered_rdd

    # Pad or trim samples in order to have the same length.
    print('{} Padding arrays'.format(datetime.now()))
    padded_rdd = norm_rdd.map(
        lambda sample: trim_or_pad_samples(sample, max_length))

    # One hot encoding of label
    print('{} One hot encoding labels'.format(datetime.now()))
    if num_classes == 2:
        final_rdd = padded_rdd.map(
            lambda sample: sample[0] + label_OHE_bin[sample[1]])
    else:
        final_rdd = padded_rdd.map(
            lambda sample: sample[0] + label_OHE[sample[1]])

    # Convert the RDD to a DataFrame and fill null values with 0
    final_df = final_rdd.toDF() \
         .na.fill(0)

    return final_df, dictionary, test_raw_data
Example #25
 def setUpClass(cls):
     conf = pyspark.SparkConf().setMaster("local[2]").setAppName("testing")
     cls.sc = pyspark.SparkContext(conf=conf)
     cls.spark = pyspark.SQLContext(cls.sc)
        # predict new rating
        if item_rate_list[i][0] in neigh_dict.keys():
            sim = neigh_dict[item_rate_list[i][0]]
            numer += (sim * item_rate_list[i][1])
            denom += sim
    if denom > 0:
        pred_rating = numer / denom
    return user, pred_rating


neigh_dict = {}

sc = pyspark.SparkContext.getOrCreate()

given_items = eval(sys.argv[2])
ip = pyspark.SQLContext(sc).read.option("header", "true").json(sys.argv[1])
ip_rdd = ip.select("overall", "reviewerID", "asin").rdd.map(list)
''' 
Filtering - only one rating per user per item
The data is by default in the descending order of review time. Grouping by (item, user) and taking the first rating from 
the list of ratings as that would be the most recent. 
'''
filter_one = ip_rdd.map(lambda x: ((x[2], x[1]), x[0])).combineByKey(
    lambda x: [x], lambda u, v: u + [v], lambda x, y: x + y)
filtered_data = filter_one.map(lambda x: (x[0][0], x[0][1], list(x[1])[0]))
items_users_group = filtered_data.map(
    lambda x: (x[0], (x[1], x[2]))).combineByKey(
        lambda x: [x], lambda u, v: u + [v], lambda x, y: x + y).map(
            lambda x: (x[0], list(x[1]))).filter(lambda x: len(x[1]) >= 25)
filter_items = set(items_users_group.map(lambda x: x[0]).collect())
# The big dataset might yield more than 1000 items or users that is why not broadcasting the set.
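A toy illustration of the "keep only the most recent rating per (item, user)" logic described above, on a tiny in-memory RDD (the records are made up; the real input comes from the JSON file):

# Hypothetical (item, user, rating) records, most recent first, in a single partition
# so the in-order combineByKey behaviour mirrors the dedup step above.
toy = sc.parallelize([
    ('item1', 'userA', 5.0),   # most recent rating by userA for item1 -> kept
    ('item1', 'userA', 3.0),   # older rating -> dropped
    ('item1', 'userB', 4.0),
], 1)
deduped = toy.map(lambda x: ((x[0], x[1]), x[2])) \
    .combineByKey(lambda v: [v], lambda acc, v: acc + [v], lambda a, b: a + b) \
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1][0]))
print(sorted(deduped.collect()))  # [('item1', 'userA', 5.0), ('item1', 'userB', 4.0)]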
Example #27
import multiprocessing

import pyspark as ps
from lsa import LatentSemanticAnalysis
from reviews import get_beer_reviews_dataframe

if __name__ == '__main__':
    cpu = 'local[{}]'.format(multiprocessing.cpu_count())
    sc = ps.SparkContext(cpu)
    print "Just created a SparkContext"
    sql_context = ps.SQLContext(sc)
    print "Just created a SQLContext"
    df = sql_context.read.json('../data/reviews.json')
    print "df"
    df_reviews = df.select('brewery_name', 'beer_name', 'state', 'beer_style',
                           'avg_rating', 'text')
    print "df_reviews"

    try:
        df_beer_reviews = get_beer_reviews_dataframe(df_reviews)
    except:
        pass
    df_beer_reviews = get_beer_reviews_dataframe(df_reviews)
    df_beer_reviews.persist(ps.StorageLevel.MEMORY_AND_DISK)
    print "df_beer_reviews"
    print df_beer_reviews.count()

    lsa_ = LatentSemanticAnalysis(sc, sql_context, df_beer_reviews)
    lsa_.transform(n_components=500)
    # sc.stop()
    path = "test"

    cs.entries.toDF().write.parquet(path)

    cs.entries.toDF().coalesce(1)\
       .write.format("com.databricks.spark.csv")\
       .option("header", "true")\
       .save("testtest.csv")


if __name__ == "__main__":
    path_rating = "../data/ratings.csv"

    conf = (pyspark.SparkConf().setAppName('test').set(
        "spark.executor.memory",
        "2g").setMaster("local[4]")).set('spark.sql.pivotMaxValues', u'50000')
    sc = pyspark.SparkContext(conf=conf)

    features_rating = StructType([
        StructField("userId", FloatType(), True),
        StructField("movieId", FloatType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", StringType(), True)
    ])

    df = pyspark.SQLContext(sc).read.format("csv").schema(
        features_rating).option("header", True).load(path_rating)
    df = df.select("userId", "movieId", "rating")

    compute_similarity(df)
### --- dir structure and params
hdfsPath = params['hdfsPath']
dataDir = params['dataDir']

# - Spark Session
sc = SparkSession\
    .builder\
    .appName("WD Inequality")\
    .enableHiveSupport()\
    .getOrCreate()

# - Spark Session Log Level: INFO
sc.sparkContext.setLogLevel("INFO")

# - SQL context
sqlContext = pyspark.SQLContext(sc)

### --- get wmf.mediawiki_history snapshot
snaps = sqlContext.sql('SHOW PARTITIONS wmf.mediawiki_history')
snaps = snaps.toPandas()
mwwikiSnapshot = snaps.tail(1)['partition'].to_string()
mwwikiSnapshot = mwwikiSnapshot[-7:]
currentMonth = mwwikiSnapshot
currentYear = mwwikiSnapshot[0:4]

### --- Edits distribution: since the beginning of time and until current snapshot
wdri = sqlContext.sql(
    'SELECT event_user_id, event_user_is_bot_by FROM wmf.mediawiki_history WHERE event_entity="revision" AND event_type="create" AND wiki_db="wikidatawiki" AND page_namespace=0 AND snapshot="'
    + mwwikiSnapshot + '"')
wdri = wdri.withColumn("bot_name",
                       array_contains(col("event_user_is_bot_by"), "name"))
Example #30
import numpy as np
import pandas as pd
import pyspark
import logging

FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
logging.basicConfig(format=FORMAT)

logger = logging.Logger('Main')

try:
    sc = pyspark.SparkContext(appName="Test")
except Exception as err:
    logger.warning(err)
    
sql = pyspark.SQLContext(sc)
    
rdd = sc.parallelize([
    ('A', 1),
    ('B', 5),
    ('A', 6),
    ('B', 9),
    ('A', 2)
])

def func(x):
    key = x[0]
    value = x[1]
    print(value)
    arr1 = [
            ["Movies" , np.array([1.0, 2.5], dtype=np.float32)],