def _calc_model_metrics(df: DataFrame, prob_col: str,
                        label_col: str) -> Dict[str, float]:
    r"""calc model metrics at max f1 given probabilities and labels

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_col : str
        colname w/ raw probabilities of being in class 1
    label_col : str

    Returns
    -------
    max_metrics : dict
        dict with keys : 'tp', 'tn', 'fp', 'fn', 'f1', 'accuracy',
        'precision', 'recall' and corresponding values as floats

    Raises
    ------
    UncaughtException
    """
    _persist_if_unpersisted(df)

    metrics_df = df.groupby(prob_col).pivot(label_col).count().fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False))

    window = Window.orderBy(prob_col).rowsBetween(Window.unboundedPreceding,
                                                  -1)
    metrics_df = metrics_df.withColumn('fn', F.sum(F.col(str(1))).over(window))
    metrics_df = metrics_df.withColumn(
        'tn',
        F.sum(F.col(str(0))).over(window)).fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False, 1))

    all_count = df.count()
    pos_count = df.where(F.col(label_col) == 1).count()
    neg_count = all_count - pos_count

    metrics_df = metrics_df.withColumn('tp', pos_count - F.col('fn'))
    metrics_df = metrics_df.withColumn('fp', neg_count - F.col('tn'))

    metrics_df = metrics_df.withColumn('precision', (F.col('tp')) /
                                       (F.col('tp') + F.col('fp')))
    metrics_df = metrics_df.withColumn('recall', F.col('tp') / pos_count)

    metrics_df = metrics_df.withColumn(
        'informativeness', 2 * (F.col('precision') * F.col('recall')) /
        (F.col('precision') + F.col('recall')))
    metrics_df.persist(StorageLevel(False, True, False, False))

    max_metrics = metrics_df.where(
        F.col('informativeness') == metrics_df.select(
            F.max(F.col('informativeness'))).take(1)[0][0]).take(
                1)[0].asDict()
    max_metrics['accuracy'] = (max_metrics['tp'] +
                               max_metrics['tn']) / all_count
    max_metrics['threshold'] = max_metrics[prob_col]

    return max_metrics
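
Note: the positional booleans passed to StorageLevel throughout these examples follow the constructor order StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication=1). A small reference sketch (not part of the original example) mapping the tuples used above to PySpark's named constants:

from pyspark import StorageLevel

# StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication=1)
MEMORY_ONLY = StorageLevel(False, True, False, False, 1)      # same flags as StorageLevel.MEMORY_ONLY
MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)   # same flags as StorageLevel.MEMORY_AND_DISK
DISK_ONLY = StorageLevel(True, False, False, False, 1)        # same flags as StorageLevel.DISK_ONLY

# e.g. metrics_df.persist(StorageLevel(False, True, False, False)) above is a
# serialized memory-only persist and could equivalently be written as
# metrics_df.persist(StorageLevel.MEMORY_ONLY)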
Example #2
def stage3(args):
    processed_path = args.wikidata_path.replace('.json', '-processed.json')
    processed_rdd = sc.textFile(processed_path).map(lambda s: literal_eval(s))
    processed_rdd.persist(StorageLevel(True, True, False, False, 1))

    ## Restore last checkpoint
    start_level = 0
    for i in range(args.level):
        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(i))
        if os.path.exists(expanded_path):
            start_level = i + 1
    if start_level > 0:
        print(
            f'Starting level {start_level} > 0, restoring previous checkpoint')
        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(start_level - 1))
        all_categories = sc.textFile(expanded_path).map(literal_eval)
    else:
        all_categories = processed_rdd\
            .filter(lambda tup: tup[1]['wiki_title'] is not None)\
            .map(lambda tup: (tup[0], (0, {}, tup[1]['instance_of'] + tup[1]['subclass_of'] + tup[1]['occupation'])))

    for i in range(start_level, args.level):
        next_level_categories = all_categories\
            .filter(lambda tup: tup[1][-1] != [])\
            .flatMap(lambda tup: [(category, tup[0]) for category in tup[1][-1]])\
            .join(processed_rdd)\
            .map(lambda tup: (tup[1][0], tup[1][1]['subclass_of']))\
            .reduceByKey(add, 512)
        next_all_categories = all_categories.leftOuterJoin(next_level_categories, 512)\
            .map(update_categories)
        next_all_categories.persist(StorageLevel(True, True, False, False, 1))
        all_categories.unpersist()
        all_categories = next_all_categories
        unfinished_entities = all_categories.filter(
            lambda tup: tup[1][-1] != []).count()
        print(
            'After {} iterations, there are still {} entities that need to be expanded'
            .format(i + 1, unfinished_entities))
        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(i))
        print(
            'Persisting it onto disk with file name {}'.format(expanded_path))
        all_categories.saveAsTextFile(
            expanded_path, 'org.apache.hadoop.io.compress.BZip2Codec')
        if i > 0:
            old_expanded_path = args.wikidata_path.replace(
                '.json.bz2', '-expanded-level{}.txt'.format(i - 1))
            print(f'Removing old directory {old_expanded_path}')
            rmtree(old_expanded_path)
        if unfinished_entities == 0:
            break
def word_count_plus_pesist_mem_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py',
                              'sesgo_scripts.py', 'persist_scripts.py',
                              'repartition_scripts.py', 'config_scripts.py',
                              'wordCountConfig.py'
                          ])

        data = sc.textFile(data_file)
        words = data.flatMap(mymapeo)

        frequencies = words.map(lambda x: (x, 1)).reduceByKey(
            lambda a, b: a + b).persist(
                StorageLevel(True, False, False, False,
                             int(internal_param[7])))

        topFreqs = frequencies.sortBy(lambda x: x[1], ascending=False).persist(
            StorageLevel(True, False, False, False, int(internal_param[7])))
        print('Top 5 frequencies:', topFreqs.take(5))

        leastFreqs = frequencies.sortBy(lambda x: x[1], ascending=True)
        print('Least 5 frequencies:', leastFreqs.take(5))

        topLenFreqs = topFreqs.sortBy(lambda x: len(x[0]),
                                      ascending=False).persist(
                                          StorageLevel(True, False, False,
                                                       False,
                                                       int(internal_param[7])))
        print('Top 5 length:', topLenFreqs.take(5))

        containsAwords = words.filter(lambda x: 'a' in x)
        print('Number of words containing a:', containsAwords.count())

        containsXwords = words.filter(lambda x: 'x' in x)
        print('Number of words containing x:', containsXwords.count())

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
Example #4
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs,
                 n_iter_search):
    # Spark configuration.
    conf = (SparkConf().setMaster("local[" + str(n_jobs) +
                                  "]").setAppName("FDD").set(
                                      "spark.executor.memory",
                                      "512mb").set("spark.cores.max",
                                                   str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components, n_pc, covar_types))
    # Distribute the hyperparameters vector.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(
        lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
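
The train_with_parameters function used above is not shown in this example. A plausible sketch, assuming each worker fits a PCA + Gaussian mixture model (consistent with the n_pc and covar_types arguments) and returns a (BIC, model) pair so that the sortBy / collect()[0][1] lines above pick the lowest-BIC model:

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture


def train_with_parameters(param, data_broadcast):
    # hypothetical worker function: fit one candidate model and score it by BIC
    n_components, n_pc, covar_type = int(param[0]), int(param[1]), str(param[2])
    data = data_broadcast.value
    reduced = PCA(n_components=n_pc).fit_transform(data)
    gmm = GaussianMixture(n_components=n_components,
                          covariance_type=covar_type).fit(reduced)
    return gmm.bic(reduced), gmm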
Example #5
def main(input_file, output_file):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task1')

    textRDD = sc.textFile(input_file).repartition(2)

    reviews = textRDD.map(parse_review).persist(StorageLevel(True, True, False, False, 1))

    review_counts_2018 = reviews.filter(lambda r: is_year(r, '2018'))

    distinct_users = reviews.map(lambda r: r["user_id"]).distinct()

    top_users = reviews.map(lambda review: (review["user_id"], 1)).reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: (-1 * x[1], x[0]),numPartitions=1).take(10)

    distinct_businesses = reviews.map(lambda r: r["business_id"]).distinct()

    top_businesses = reviews.map(lambda review: (review["business_id"], 1)).reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: (-1 * x[1], x[0]),numPartitions=1).take(10)

    result = {"n_review": reviews.count(),
              "n_review_2018": review_counts_2018.count(),
              "n_user": distinct_users.count(),
              "top10_user": top_users,
              "n_business": distinct_businesses.count(),
              "top10_business": top_businesses
              }

    fp = open(output_file, "w")
    fp.write(json.dumps(result))
    fp.close()
Example #6
def test(allHex,hashFiles,sc,sqlc,path,featureFitModel):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")
    def fun(accum,x):

        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [str(int(word,16)) for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))
    Vec= bytesRdd.map(lambda x: (x[0],createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0],SparseVector(256,numpy.nonzero(x[1])[0],x[1][x[1]>0])))

    ngramFrame = sqlc.createDataFrame(sparseVec,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData,path)
    testData.show()
Example #7
def train(allHex,labels,hashFiles,sc,sqlc,path):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum,x):
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",vocabSize=1000)

    featureFitModel = cv.fit(ngramFrame)

    featuresCV = featureFitModel.transform(ngramFrame)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1],x[0]))

    labelFrame = labelRdd.toDF(["did","label"])

    trainData = featuresCV.join(labelFrame,"did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData,path)

    trainData.show()
    return featureFitModel
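
The saveData helper called in the two snippets above is not defined here; a minimal hypothetical version that simply writes the prepared dataframe out as Parquet could be:

def saveData(df, path):
    # hypothetical helper: persist the prepared dataframe for later stages
    df.write.mode("overwrite").parquet(path)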
def word_count_persist_disk_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py',
                              'sesgo_scripts.py', 'persist_scripts.py',
                              'repartition_scripts.py', 'config_scripts.py',
                              'wordCountConfig.py'
                          ])

        data = sc.textFile(data_file)
        words = data.flatMap(mymapeo).persist(
            StorageLevel(False, True, False, False, int(internal_param[7])))

        frequencies = words.map(lambda x: (x, 1)).reduceByKey(
            lambda a, b: a + b)
        frequencies.collect()
        print(frequencies.take(5))

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
Example #9
def main():

    # Logistic regression or Random forest
    output_type = sys.argv[1]

    raw_file_path = sys.argv[2]

    path_to_output = sys.argv[3]

    raw_input_rdd = sc.textFile(
        raw_file_path, minPartitions=32).map(lambda line: line.encode("utf-8"))

    process_data = raw_input_rdd.map(lambda line: replacetab(line))

    df_for_pp = create_df(process_data)

    pp = PreProcess(df_for_pp)

    preprocessed_data = pp.preprocess_data()

    data = None

    if output_type == "rf":
        data = prep_rf(preprocessed_data)

    # write to file the data variable
    data.persist(StorageLevel(True, True, False, False, 1))
    data.write.parquet(path_to_output + "final_" + output_type +
                       "_data.parquet")
Example #10
def main():
    # Initialize Spark
    configuration = SparkConf().setAppName("1000-genomes Project")
    spark_context = SparkContext(conf=configuration)

    container_name = "1000-genomes-dataset"
    config = {'user':os.environ['OS_USERNAME'], 
          'key':os.environ['OS_PASSWORD'],
          'tenant_name':os.environ['OS_TENANT_NAME'],
          'authurl':os.environ['OS_AUTH_URL']}

    # Connect to Object-Storage to retrieve filenames
    conn = swiftclient.client.Connection(auth_version=3, **config)
    (storage_url, auth_token) = conn.get_auth()
    (response, content) = swiftclient.client.get_container(url=storage_url,container=container_name, token=auth_token)
    
    num_files_to_include = 3
    names = filter(lambda t: t['name'][-4:] == '.bam', content)   
    filelist = [{"name" : c['name'].strip(), "hash" : c['hash'].strip()} for c in names[:num_files_to_include]]
    numpart = len(filelist)/2

    # Distribute the processing of the files
    filenames = spark_context.parallelize(filelist, numpart)
    mapped_data = filenames.flatMap(process).persist(StorageLevel(True, True, False, True, 1))

    # Filter the different results returned from the processing 
    start_time_filtering = time.time()
    kmers = mapped_data.filter(lambda (k, (v, e)): k == "KMER").map(lambda (k, v): v).reduceByKey(add,numPartitions=5)
    positions = mapped_data.filter(lambda (k,v): k == "POSITION").map(lambda (k, v): v).reduceByKey(add,numPartitions=5) 

    time_filtering = time.time() - start_time_filtering
    time_mapping = mapped_data.filter(lambda (k,v): k == "TIME-MAPPING").map(lambda (k, v): v).reduceByKey(add,numPartitions=5)
    time_download = mapped_data.filter(lambda (k,v): k == "TIME-DOWNLOAD").map(lambda (k, v): v).reduceByKey(add,numPartitions=5)

    time_mapping = time_mapping.collect()
    time_download = time_download.collect() 

    # Write results to files
    timing_file = open("timing.txt", "w")
    timing_file.write("Mapping " + str(time_mapping) + "\n")
    timing_file.write("Mapping+Downloading " + str(time_download) + "\n")
    timing_file.write("Filtering " + str(time_filtering) + "\n")
    timing_file.close()

    kmer_file = open("kmers.txt", "w")
    for item in kmers.collect():
        print>>kmer_file, item
    kmer_file.close()

    pos_file = open("positions.txt", "w")
    for item in positions.collect():
        print>>pos_file, item
    pos_file.close()

    for obj in [open(f, "r") for f in ["kmers.txt", "positions.txt"]]:
        swiftclient.client.put_object(url=storage_url, token=auth_token, container="result", name="group_11/" + str(start_time_filtering) + "_" + obj.name, contents=obj)
        obj.close()

    return
Example #11
def __init__(self, outfile):
    self.spark = SparkSession.builder.appName(
        "sparkUserAndItemData").enableHiveSupport().getOrCreate()
    # self.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    # self.spark.conf.set("spark.sql.crossJoin.enabled", "true")
    self.out_files = outfile
    self.age_dict = {i: str(i + 1) for i in range(100)}
    self.workid_dict = {i: str(i + 1) for i in range(100)}
    self.sex_dict = {'f': '1', 'm': '2', '-1': '0'}
    self.height_dict = {i: str(i - 99) for i in range(100, 227)}
    # dictionaries of valid category values
    # self.user_data = self.spark.sql(UserSql).persist(StorageLevel(True, True, False, False, 1))
    # DISK_ONLY = StorageLevel(True, False, False, False, 1)
    # DISK_AND_ME
    self.user_data, self.item_data = self.feature_convert_id()
    self.user_data.persist(StorageLevel(True, True, False, False, 1))
    self.item_data.persist(StorageLevel(True, False, False, False, 1))
def main(input_file_review, input_file_business, output_file_a, output_file_b):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task3')

    review_rdd = sc.textFile(input_file_review).repartition(2)
    business_rdd = sc.textFile(input_file_business).repartition(2)

    parsed_reviews = review_rdd.map(parse_review)
    parsed_businesses = business_rdd.map(parse_business)

    city_stars = parsed_businesses.leftOuterJoin(parsed_reviews).map(lambda x: x[1]) \
        .filter(lambda x: x[1] is not None) \
        .combineByKey(
        lambda v: (v, 1)
        , lambda x, c: (x[0] + c, x[1] + 1)
        , lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda bac: (bac[0], round(bac[1][0] / bac[1][1], 1))) \
        .sortBy(lambda x: (-1 * x[1], x[0].lower()), numPartitions=1) \
        .map(lambda b: b[0] + ',' + str(b[1])).persist(StorageLevel(True, True, False, False, 1))

    fp = open(output_file_a, "w")
    fp.write('city,stars\n')
    ress = city_stars.collect()

    # city_stars.map(lambda b: b[0] + ',' + str(b[1])).saveAsTextFile(output_file_a)
    for r in ress:
        fp.write(r + '\n')
    fp.close()

    collect_init_seconds = time.time()
    print('city,stars')
    collect_stars = (city_stars.collect())[:10]
    for r in collect_stars:
        print(r)
    m1 = time.time() - collect_init_seconds

    take_init_seconds = time.time()
    print('city,stars')
    take_stars = city_stars.take(10)
    for r in take_stars:
        print(r)
    m2 = time.time() - take_init_seconds

    if m1 > m2:
        explanation = "Method 2 i.e. taking the first 10 cities is faster than Method 1"
    else:
        explanation = "Method 1 i.e.collecting all the data and printing the first 10 cities is faster than Method 2"

    result = {
        "m1": round(m1, 1),
        "m2": round(m2, 1),
        "explanation": explanation
    }
    fp = open(output_file_b, "w")
    fp.write(json.dumps(result))
    fp.close()
def word_count_sort_pesist_mem_and_disk_ser(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py',
                              'sesgo_scripts.py', 'persist_scripts.py',
                              'repartition_scripts.py', 'config_scripts.py',
                              'wordCountConfig.py'
                          ])

        data = sc.textFile(data_file)
        words = data.flatMap(mymapeo).persist(
            StorageLevel(True, True, False, False, int(internal_param[7])))

        frequencies = words.map(lambda x: (x, 1)).reduceByKey(
            lambda a, b: a + b).persist(
                StorageLevel(True, True, False, False, int(internal_param[7])))
        numWords = data.count()
        sortFreq = frequencies.sortBy(lambda x: x[1], ascending=False)
        topFreqs = sortFreq.take(5)

        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.collect())
        print('Top 5 frequencies:', topFreqs)

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
Example #14
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):

    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)

    rdd1 = sc.wholeTextFiles(asmFileString, 20)

    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])

    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)

    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines(
    ))).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])

    featureFrame = ngramFrame.join(segments, "docId")

    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] + x[
                          '4grams'] + x['segments'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
Example #15
def analyse(path):

    # read file
    df = spark.read.option('header', True) \
        .option('inferSchema', True) \
        .csv(path) \
        .persist(StorageLevel(True, True, False, False))

    df.createOrReplaceTempView("survey_results")

    # functions
    get_developer_os()
    get_developer_type()
    get_contrib_open_source(df)
    get_coding_as_hobby(df)
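
The helpers called by analyse() are not included in this example. As an illustration of the pattern they presumably follow, here is a hypothetical get_developer_os() that queries the survey_results temp view registered above (the OpSys column name is an assumption about the survey schema):

def get_developer_os():
    # hypothetical sketch: count respondents per operating system
    spark.sql("""
        SELECT OpSys, COUNT(*) AS respondents
        FROM survey_results
        GROUP BY OpSys
        ORDER BY respondents DESC
    """).show()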
Example #16
def generate_fake_news(data, num_entries, min_length, max_length):
    from pyspark import StorageLevel
    modd_data = data.map(lambda x: x + " SENTENCE_END")
    mrkv_pairs = modd_data.map(lambda x: x.split())\
                          .map(lambda x: [x[0].capitalize()] + x[1:])\
                          .flatMap(lambda x: [(x[i], x[i + 1]) for i in range(len(x) - 1)])\
                          .cache()
    revrsed_mrkv_pairs = mrkv_pairs.map(lambda x: (x[1], x[0]))\
                                   .partitionBy(data.getNumPartitions())\
                                   .cache()
    cur_matrix = mrkv_pairs.persist(StorageLevel(True, True, False, False, 1))
    cur_matrix = cur_matrix.map(lambda x: (x[0], (x[1], [x[0], x[1]]))).persist()
    for i in range(max_length):
        cur_matrix = revrsed_mrkv_pairs.join(revrsed_mrkv_pairs)\
                                       .map(lambda x: (x[1][0], (x[0], x[1][0] + x[1][1][1]))).persist()
    print(cur_matrix.take(1))
    news_entries = []

    for _ in range(num_entries):
        first_element = mrkv_pairs.filter(lambda x: x[0] != "SENTENCE_END" and \
                                          x[1] != "SENTENCE_END" and x[0][0].isupper())\
                                  .takeSample(False, 1)[0]
        fake_news = first_element[0]
        cur_wrd = first_element[1]

        for i in range(max_length):
            cur_pool = mrkv_pairs.filter(lambda x: x[0] == cur_wrd).cache()
            if i < min_length:
                cur_pool = cur_pool.filter(lambda x: x[1] != "SENTENCE_END").cache()
                if cur_pool.count() == 0:
                    i = 0
                    fake_news = first_element[0]
                    cur_wrd = first_element[1]
                    continue

            cur_pair = cur_pool.takeSample(False, 1)[0]
            fake_news += (" " + cur_pair[0])
            cur_wrd = cur_pair[1]
            if cur_wrd == "SENTENCE_END":
                fake_news += "."
                break
        news_entries.append(fake_news)
    return news_entries
def main(train_file, model_file, stopwords_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()
    stopwords = {s for s in sc.textFile(stopwords_file).collect()}

    reviews = (sc.textFile(train_file).map(
        json.loads).map(lambda d: tokenize(d, stopwords)).persist(
            StorageLevel(True, True, False, False)))

    n = reviews.count()

    # calculating the number of documents each term appears in
    dfs = (reviews.flatMap(lambda d: d["tokens"]).map(
        lambda t: (t, 1)).reduceByKey(add).collectAsMap())

    idfs = {k: math.log(n / v) for k, v in dfs.items()}

    def add_key_prefix(rdd, prefix):
        return rdd.map(lambda x: ("{}_{}".format(prefix, x[0]), x[1]))

    business_profiles = build_profile(reviews, "business_id", idfs)

    user_profiles = (reviews.map(lambda d: (d["business_id"], d["user_id"])).
                     join(business_profiles).values().aggregateByKey({}, merge,
                                                                     merge))

    add_key_prefix(user_profiles, "u").union(
        add_key_prefix(business_profiles,
                       "b")).map(json.dumps).saveAsTextFile('task2_model')

    with open(model_file, 'wb') as outfile:
        for filename in glob.glob('task2_model/part*'):
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)

    shutil.rmtree('task2_model')

    print("Duration:", time.time() - start)
def word_count_sort_pesist_mem_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])

        sc = SparkContext(conf=conf,
                          pyFiles=['run_app.py', 'config_scriptsDf.py'])
        spark = SparkSession.builder.config(conf=conf).getOrCreate()

        data = sc.textFile(data_file).flatMap(lambda x: x.split(" ")).collect()
        paralData = sc.parallelize(data, 200)

        df = paralData.map(lambda r: Row(r)).toDF(["word"])
        cleanDf = df.filter(col('word') != '').withColumn(
            'word', regexp_replace(col('word'), '[^\sa-zA-Z0-9]', '')).persist(
                StorageLevel(True, False, False, False,
                             int(internal_param[7])))
        freqDf = cleanDf.withColumn(
            'count', lit(1)).groupBy('word').sum('count').withColumnRenamed(
                'sum(count)', 'frequencies')

        topFreqsDf = freqDf.orderBy('frequencies', ascending=False).limit(5)

        print('Number of words: ', cleanDf.count())
        print('Top 5 frequencies:')
        topFreqsDf.show()

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
Example #19
def main():
    spark = SparkSession \
       .builder \
       .appName("RandomForest") \
       .config("spark.executor.heartbeatInterval","60s")\
       .getOrCreate()

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    sc.setLogLevel("INFO")

    train_df = spark.read.parquet(sys.argv[1])
    #Persist the data in memory and disk
    train_df.persist(StorageLevel(True, True, False, False, 1))

    rfc = RandomForestClassifier(maxDepth=8,
                                 maxBins=2400000,
                                 numTrees=128,
                                 impurity="gini")
    rfc_model = rfc.fit(train_df)
    rfc_model.save(sys.argv[2] + "rfc_model")
Example #20
    pan = None
print(time.clock()-start)
			
# I converted csv to a parquet file to save space and time

def lis(x):
    return [float(i) for i in x[1:-1].split(',')]

spark.read.load("train_fet.csv", format="csv", inferSchema="true", header="true").rdd \
          .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
          .toDF(["index", "file", "features"])
          .write.parquet("train_fet.parquet")

# Now I create the Bag of Visual Words representation using K-means

schema = spark.read.parquet("train_fet.parquet").persist(StorageLevel(True, True, False, False, 1))
start = time.clock()
kmeans = KMeans(k=K, initMode='random')
print(time.clock()-start)
start = time.clock()
model = kmeans.fit(schema)
print(time.clock()-start)
start = time.clock()
centers = model.clusterCenters()
print(time.clock()-start)
model.save('KmeansModel')

# Next I create the Hamming Embedding Matrix

G = np.random.randn(db, d)
P, _ = np.linalg.qr(G)
def _eval_df_model(
        df: DataFrame,
        prob_mod: pyspark.ml.Model,
        sample_size: Optional[int] = 10**5,
        additional_metrics: bool = False
) -> propensity_model_performance_summary:
    r"""calculate binary classification model metrics on provided dataframe

    Calculate accuracy, precision, and recall at maximum value for
    informativeness (f1) also provide auc and auprc(precision-recall curve)


    Parameters
    ----------
    df : pyspark.sql.DataFrame
        dataframe containing the model's feature and label columns
    prob_mod : pyspark.ml.Model
        fitted binary classification model used to score ``df``
    sample_size
    additional_metrics
        if True, also compute threshold, informativeness, precision, recall,
        and accuracy via _calc_model_metrics


    Returns
    -------
    prob_mod_perf_sum : propensity_model_performance_summary

    Other Parameters
    ----------------
    sample_size: int
        max sample size used to calculate performance. Defaults to 10**5; can
        be left as None to avoid sampling

    Notes
    -----
    propensity_model_performance_summary : namedtuple
        'auc' : float
        'auprc' : float
            area under precision recall curve
        'threshold' : float
        'informativeness' (f1) : float
        'precision' : float
        'recall' : float
        'accuracy'  : float

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    _calc_auc_auprc
    _calc_model_metrics
    """
    sample_df = _sample_df(df=df, sample_size=sample_size)
    _persist_if_unpersisted(sample_df)

    label_col = prob_mod.getOrDefault('labelCol')
    prob_col = prob_mod.getOrDefault('probabilityCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    sample_df = prob_mod.transform(sample_df.select(features_col, label_col))

    prob_1_col = "{prob_col}_1".format(prob_col=prob_col)
    sample_df = sample_df.withColumn(
        prob_1_col,
        F.udf(lambda x: float(x[1]), T.DoubleType())(F.col(prob_col)))
    sample_df.persist(StorageLevel(False, True, False, False, 1))

    auc, auprc = _calc_auc_auprc(df=sample_df,
                                 prob_col=prob_1_col,
                                 label_col=label_col)

    metric_keys = [
        'threshold', 'informativeness', 'precision', 'recall', 'accuracy'
    ]
    if additional_metrics:
        metrics_dict = _calc_model_metrics(df=sample_df,
                                           prob_col=prob_1_col,
                                           label_col=label_col)
        metrics_dict = {
            x: metrics_dict[x]
            for x in metrics_dict.keys() if x in metric_keys
        }
    else:
        metrics_dict = {x: None for x in metric_keys}

    prob_mod_per_sum = propensity_model_performance_summary(auc=auc,
                                                            auprc=auprc,
                                                            **metrics_dict)

    return prob_mod_per_sum
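
The _calc_auc_auprc helper referenced above is not shown in this example. A minimal sketch of how such a helper could be written with PySpark's BinaryClassificationEvaluator, assuming prob_col holds the class-1 probability as a double:

from pyspark.ml.evaluation import BinaryClassificationEvaluator


def _calc_auc_auprc(df, prob_col, label_col):
    # hypothetical sketch, not the original implementation
    evaluator = BinaryClassificationEvaluator(rawPredictionCol=prob_col,
                                              labelCol=label_col)
    auc = evaluator.evaluate(df, {evaluator.metricName: "areaUnderROC"})
    auprc = evaluator.evaluate(df, {evaluator.metricName: "areaUnderPR"})
    return auc, auprc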
Example #22
    shingles = []
    # shingle = list(image[:k, :k])
    for i in range(image.shape[0] - k):
        for j in range(image.shape[1] - k):
            red_img = image[i:i + k, j:j + k, :].flatten()
            shingles.append(tuple(red_img))  # , :].flatten()))
    # shingles.append(tuple(shingle))

    return (name, shingles)


rdd2 = rdd1.map(map1)

from pyspark import StorageLevel

DISK_ONLY = StorageLevel(True, False, False, False, 1)

rdd2.persist(DISK_ONLY)

rdd3 = rdd2.flatMap(lambda r: r[1])

k_shingle_space = rdd3.distinct().collect()

N = len(k_shingle_space)

print(N)


def map2(tuple1):
    name, image = tuple1
    #bool_vec = bitarray(N)
Example #23
import time

# Slight modification to the baseline approach where a base layer is pre-materialized and
# instead of starting the CNN inference all the way from raw images start from the materialized
# base layer.
if __name__ == '__main__':
    ############################change appropriately###################################
    model = 'alexnet'
    pre_mat_layer_index = -4  # from the top
    explore_layer_index = -1  # from the top
    struct_input = 'hdfs://spark-cluster-master:9000/foods.csv'
    pre_mat_input = 'hdfs://spark-cluster-master:9000/' + model + "_pre_mat_layer.parquet"
    heap_memory = 29
    num_executors = 1
    executor_cpu = 5
    storage_level = StorageLevel(True, True, False, True)  # memory and disk deserialized
    sp_core_memory_fraction = 0.6
    ###################################################################################
    prev_time = time.time()
    if model == 'vgg16':
        initial_shape = VGG16.transfer_layers_shapes[pre_mat_layer_index]
    elif model == 'resnet50':
        initial_shape = ResNet50.transfer_layers_shapes[pre_mat_layer_index]
    elif model == 'alexnet':
        initial_shape = AlexNet.transfer_layers_shapes[pre_mat_layer_index]

    conf = SparkConf()
    app_name = 'pre-mat-' + model + "-l:" + str(pre_mat_layer_index)
    conf.setAppName(app_name)
    conf.set("spark.executor.memory", str(heap_memory)+"g")
    conf.set("spark.memory.fraction", sp_core_memory_fraction)
Example #24
def create_context(host, port):
    """
    Function to create and set up a new streaming context.
    :param host: A string describing the IP to use for HTTP Client. Example: 'localhost'
    :param port: An integer value that corresponds to the port number used to read data from.
    :return: The created and configured streaming context.
    """
    spark_context = SparkContext(master="local[2]", appName="TwitterStreamApp")
    spark_context.setLogLevel("ERROR")
    # create the Streaming Context from the above spark context with interval size 2 seconds
    streaming_context = StreamingContext(spark_context, 2)
    # setting a checkpoint to allow RDD recovery
    streaming_context.checkpoint(CHECKPOINT)
    # read the tweet stream from the given host and port
    data_stream = streaming_context.socketTextStream(host,
                                                     port,
                                                     storageLevel=StorageLevel(
                                                         True, True, False,
                                                         False, 2))
    # split each tweet into words
    words = data_stream.flatMap(lambda line: line.split(" "))
    # filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
    hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
    # adding the count of each hashtag to its last count
    tags_totals = hashtags.updateStateByKey(aggregate_tags_count)

    def process_rdd(time, rdd):
        """
        Function that processes an RDD.
        :param time: Time stamp of the process.
        :param rdd: The RDD to be processed.
        """
        print("----------- %s -----------" % str(time))
        try:
            if rdd:
                # Get spark sql singleton context from the current context
                sql_context = get_sql_context_instance(rdd.context.getConf())
                # convert the RDD to Row RDD
                row_rdd = rdd.map(
                    lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
                # create a DF from the Row RDD
                hashtags_dataframe = sql_context.createDataFrame(row_rdd)
                # Register the dataframe as table
                hashtags_dataframe.createOrReplaceTempView("hashtags")
                # get the top 10 hashtags from the table using SQL and print them
                hashtag_counts_dataframe = sql_context.sql(
                    "select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10"
                )
                hashtag_counts_dataframe.show()

                # call this method to prepare top 10 hashtags DF and send them

                def send_dataframe_to_dashboard(dataframe):
                    """
                    Function to send DataFrame to the dashboard for visualization.
                    :param dataframe: Spark DataFrame created by process_rdd().
                    """
                    # extract the hashtags from dataframe and convert them into array
                    top_tags = [
                        str(t.hashtag)
                        for t in dataframe.select("hashtag").collect()
                    ]
                    # extract the counts from dataframe and convert them into array
                    tags_count = [
                        p.hashtag_count
                        for p in dataframe.select("hashtag_count").collect()
                    ]
                    # initialize and send the data through REST API
                    request_data = {
                        'label': str(top_tags),
                        'data': str(tags_count)
                    }
                    response = post(dashboard_url, data=request_data)

                send_dataframe_to_dashboard(hashtag_counts_dataframe)
        except:
            pass

    # do processing for each RDD generated in each interval
    tags_totals.foreachRDD(process_rdd)
    return streaming_context
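
A typical driver for create_context (not part of the original snippet) recovers from the checkpoint when one exists and only builds a fresh context otherwise, e.g. using the example host and port mentioned in the docstring and comments above:

# hypothetical driver sketch
ssc = StreamingContext.getOrCreate(CHECKPOINT,
                                   lambda: create_context('localhost', 9009))
ssc.start()
ssc.awaitTermination()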
# Set spark configuration and spark context
conf = SparkConf().setAppName("part3").setMaster("local")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Read the input file as RDD
#  Here we followed the RDD paper implementation closely
# we first filter the data with "#", then split each row to get keys and values.
# Then we did the persistence (cache setting) for the data.
lines = sc.textFile(input_file).filter(lambda line: '#' not in line).map(
    lambda line: line.split("\t", 1)).persist(
        storageLevel=StorageLevel(False, True, False, False, 1))

# group by key operation
node_map = lines.groupByKey().partitionBy(partition_num)

# generate initial rank
rank = node_map.mapValues(lambda e: 1.0).partitionBy(partition_num)

# Run ten iterations, each recalculating a rank for every node based on the PageRank algorithm
#   Contribution is calculated following https://en.wikipedia.org/wiki/PageRank and original RDD paper
for i in range(10):
    contribution = node_map.join(rank).flatMap(lambda rn: [
        (to_id, rn[1][1] / rn[1][0].__len__()) for to_id in rn[1][0]
    ]).reduceByKey(lambda rn1, rn2: rn1 + rn2)
    rank = contribution.mapValues(lambda cr: 0.15 + 0.85 * cr)
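
The example stops after recomputing the ranks; a short follow-up (not in the original) that materializes the result could look like:

# hypothetical follow-up: collect the ten highest-ranked nodes and print them
top_nodes = rank.takeOrdered(10, key=lambda x: -x[1])
for node_id, node_rank in top_nodes:
    print(node_id, node_rank)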
Example #26
    else:
        return 0,0,0
    
rdd31 = rdd30.mapValues(stat_overdue)


# In[ ]:


rdd32 = rdd31.map(lambda x:)


# In[10]:


rdd31.persist(StorageLevel(False, True, True, False))


# In[ ]:


rdd31


# In[11]:


hdfs_workdir


# In[11]:
Example #27
#lines = sc.textFile(folder).map(lambda l: list(imap(string.strip, l.split(';')))).cache

# if lines.count() % 7 != 0:
#    print "missing complete weeks in dataset"
#    #exit()

r = lines.filter(lambda x: municipio(x[9])) \
    .filter(lambda x: validate(x[3])) \
    .filter(lambda x: cavallo_week(x[3])) \
    .map(lambda x: ((x[1], cell2municipi[x[9]] , week_month(x[3]),  is_we(x[3]) , day_of_week(x[3]) , day_time(x[4])), 1)) \
    .distinct() \
    .map(lambda x: ((x[0][:4] + (x[0][5],)), 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: (x[0][0], [x[0][1:] + (x[1],), ])) \
    .reduceByKey(lambda x, y: x + y) \
    .persist(StorageLevel(False, True, False, False))

###
# Carrello format: user -> [(municipality, week, weekend/workday, time_slice, count), ...]
# note: count = days of presence in the region at that time slice

# week ordering
# keys: region,busiest week,workday/we,timeslice
r = r.map(lambda x: (x[0],
                     sorted(x[1],
                            key=lambda w:
                            (w[0], sum([z[4] for z in x[1]
                                        if z[1] == w[1]]), -w[2], w[3]),
                            reverse=True)))

r = r.map(lambda x: (x[0], normalize(x[1])))
Example #28
def CalculateUser(line):
    station_val_map = station_broad.value
    imsi = line[0].split("#")[0]
    station = line[0].split("#")[1]
    visit_num = line[1]
    return (imsi, visit_num * station_val_map[station])

def list2map(l, n):
    m = {}
    for e in l:
        m[e[0]] = e[1] * 1.0 / n * 100
    return m


user_station_num = sqlContext.read.json("/input/zp/user_traj_403.json").rdd.flatMap(userStationNum)\
    .reduceByKey(lambda x, y: x + y).persist(StorageLevel(False, True, False, False, 1))

user_val = user_station_num.map(lambda line: (line[0].split("#")[0], 100))\
    .reduceByKey(lambda x, y : x).collect()
user_val_map = list2map(user_val, 100)
user_broad = sc.broadcast(user_val_map)
station_val = user_station_num.map(lambda line: (line[0].split("#")[1], 100))\
    .reduceByKey(lambda x, y : x).collect()
station_val_map = list2map(station_val, 100)
station_broad = sc.broadcast(station_val_map)

for i in range(30):
    station_val = user_station_num.map(CalculateStation).reduceByKey(lambda x, y: x + y).collect()
    station_val_max = 0
    for v in station_val:
        if v[1] > station_val_max:
def main(filter_threshold, support, input_file, output_file):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task2')
    sc.setLogLevel("OFF")
    counts = {}


    def freq_product(a, b, k):
        # aka join
        res = []
        joined = map(lambda x: tuple(sorted(x)), itertools.combinations(dict.fromkeys(itertools.chain.from_iterable(a)), k))
        for r in joined:
            if all_pairs_frequent(r, k):
                res += [r]
        return res


    def all_pairs_frequent(r, i):
        if i < 3:
            return True
        for x in itertools.combinations(r, i - 1):
            if filter_combinations(x):
                return False
        return True


    def filter_combinations(item_combination):
        cs = counts#.value
        if item_combination in cs:
            return cs[item_combination] < support
        else:
            return True


    init_time = time.time()
    textRDD = sc.textFile(input_file)
    baskets_items = textRDD.filter(lambda l: l != "user_id,business_id").map(
        lambda l: l.split(',')).map(lambda x: (x[0], x[1])).groupByKey().map(
        lambda x: dict.fromkeys(x[1])).filter(lambda x: len(x) > filter_threshold).persist(
                StorageLevel(True, True, False, False, 1))

    basket_items_with_sizes = baskets_items.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    C = {1: sorted(basket_items_with_sizes.map(lambda x: (x[0],)).toLocalIterator())}
    freq_set = sorted(basket_items_with_sizes.filter(lambda x: x[1] >= support).map(lambda x: (x[0],)).toLocalIterator())
    L = {1: freq_set}
    k = 1


    def cmap(item_set, cs):
        cnts = {}
        for c in cs:
            cnts[c] = 0
            if all(elem in item_set for elem in c):
                cnts[c] += 1
        return cnts

    while len(L[k]) > 1:
        k += 1
        C[k] = freq_product(L[k - 1], L[k - 1], k)
        counts.update(baskets_items.map(lambda itemset: cmap(itemset, C[k])).fold({},add_dicts))
        L[k] = list(itertools.filterfalse(filter_combinations, C[k]))

    fp = open(output_file, "w")
    fp.write("Candidates:\n")
    for c in C:
        if C[c]:
            if (c == 1):
                print(*["('"+elem[0]+"')" for elem in C[c]], sep=",", file=fp)
            else:
                print(*(sorted(C[c])), sep=",", file=fp)

    fp.write("\nFrequent Itemsets:\n")
    for l in L:
        if L[l]:
            if (l == 1):
                print(*["('"+elem[0]+"')" for elem in L[l]], sep=",", file=fp)
            else:
                print(*(sorted(L[l])), sep=",", file=fp)

    fp.close()

    final_time = time.time() - init_time
    print("Duration:", final_time, "seconds")
Example #30
        total_score += PRIORPOLARITY[lexicons[w]['priorpolarity']] * TYPE[
            lexicons[w]['type']]

    return total_score


# Make sure Python uses UTF-8 as tweets contains emoticon and unicode
reload(sys)
sys.setdefaultencoding('utf-8')

# Use SQLContext for better support
sqlContext = SQLContext(sc)

# Define storage level
DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)

# Read GNIP's JSON file
directory = "tweets"
datasets = sqlContext.read.json(directory)
log('# Completed reading JSON files')

# Check checksum count
file_count = datasets.where(datasets['verb'].isNull()).count()
expect('file_count', file_count, 21888)

# Check post count
all_posts = datasets.where(datasets['verb'] == 'post')
all_posts_count = all_posts.count()
expect('all_posts_count', all_posts_count, 1570398)