Example #1
    def spark_transform(self):
        """
        transforms Spark RDD with raw data into the RDD that contains
        top-n pickup spots for each block and time slot
        """
        BatchTransformer.spark_transform(self)

        n = self.psql_config["topntosave"]

        # calculation of top-n spots for each block and time slot
        self.data = (
            self.data
            .map(lambda x: ((x["block_id"], x["time_slot"], x["sub_block_id"]),
                            x["passengers"]))
            .reduceByKey(lambda x, y: x + y)
            .map(lambda x: ((x[0][0], x[0][1]), [(x[0][2], x[1])]))
            .reduceByKey(lambda x, y: x + y)
            .mapValues(lambda vals: heapq.nlargest(n, vals, key=lambda x: x[1]))
            .map(lambda x: {
                "block_id": x[0][0],
                "time_slot": x[0][1],
                "subblocks_psgcnt": x[1],
            }))

        self.data.persist(pyspark.StorageLevel(True, True, False, False, 3)).count()  # MEMORY_AND_DISK_3

        # recalculation of top-n, where for each key=(block_id, time_slot) the top-n
        # is calculated from the top-n of (block_id, time_slot) and the top-ns of
        # (adjacent_block, time_slot+1) from all adjacent blocks
        maxval = self.psql_config["upperBound"]
        self.data = (
            self.data
            .map(lambda x: ((x["block_id"], x["time_slot"]), x["subblocks_psgcnt"]))
            .flatMap(lambda x: [x] + [((bl, (x[0][1] - 1) % maxval), x[1])
                                      for bl in helpers.get_neighboring_blocks(x[0][0])])
            .reduceByKey(lambda x, y: x + y)
            .mapValues(lambda vals: heapq.nlargest(n, vals, key=lambda x: x[1]))
            .map(lambda x: {
                "block_latid": x[0][0][0],
                "block_lonid": x[0][0][1],
                "time_slot": x[0][1],
                "longitude": [helpers.determine_subblock_lonlat(el[0])[0]
                              for el in x[1]],
                "latitude": [helpers.determine_subblock_lonlat(el[0])[1]
                             for el in x[1]],
                "passengers": [el[1] for el in x[1]],
            }))

        self.data.persist(pyspark.StorageLevel(True, True, False, False, 3)).count()  # MEMORY_AND_DISK_3
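The positional arguments to pyspark.StorageLevel are (useDisk, useMemory, useOffHeap, deserialized, replication), so StorageLevel(True, True, False, False, 3) is memory-and-disk storage with replication factor 3, matching the MEMORY_AND_DISK_3 comment. A minimal, self-contained sketch of the same top-n-per-key pattern, using hypothetical toy data, might look like this:

import heapq

import pyspark

sc = pyspark.SparkContext.getOrCreate()
rides = sc.parallelize([
    (("block_a", 0, "sub_1"), 3),
    (("block_a", 0, "sub_2"), 7),
    (("block_a", 0, "sub_1"), 2),
    (("block_b", 1, "sub_9"), 4),
])
top_n = (rides
         .reduceByKey(lambda a, b: a + b)                              # sum passengers per sub-block
         .map(lambda kv: ((kv[0][0], kv[0][1]), [(kv[0][2], kv[1])]))  # regroup by (block, time slot)
         .reduceByKey(lambda a, b: a + b)
         .mapValues(lambda vals: heapq.nlargest(2, vals, key=lambda v: v[1])))
top_n.persist(pyspark.StorageLevel(True, True, False, False, 3))       # memory + disk, 3 replicas
print(top_n.collect())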
Example #2
import gc

import numpy as np
import pandas as pd
import pyspark


def generate_digraph(edge_count=40, batch_size=50,
                     steps_to_python_gc=50, domain_size=50):
    sc = pyspark.context.SparkContext.getOrCreate()
    sc.setCheckpointDir("~/.transitive_closure")
    spark = pyspark.sql.SparkSession(sc)
    # Translation to Spark format is ludicrously slow without PyArrow
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    batches_to_python_gc = steps_to_python_gc // batch_size
    batch_count = int(np.ceil(edge_count/batch_size))
    digraph = []
    for i in range(batch_count):
        if i % batches_to_python_gc == 0:
            gc.collect()
        if i == batch_count-1 and edge_count % batch_size > 0:
            batch_size = edge_count % batch_size
        new_origins = np.random.randint(0, domain_size, dtype=np.int32,
                                        size=(batch_size, 1))
        new_termini = np.random.randint(0, domain_size, dtype=np.int32,
                                        size=(batch_size, 1))
        partial_digraph = pd.DataFrame(
            np.concatenate([new_origins, new_termini], 1),
            columns=("origin", "terminus"))
        partial_digraph = spark.createDataFrame(partial_digraph).distinct()
        if digraph != []:
            digraph[0] = digraph[1].union(digraph[0])\
                                   .orderBy(["origin", "terminus"],
                                            ascending=[True, True])\
                                   .distinct()\
                                   .persist(pyspark.StorageLevel(True, False,
                                                                 False, True,
                                                                 1))\
                                   .checkpoint()
            digraph[1] = partial_digraph
        else:
            digraph = [partial_digraph, partial_digraph]
    if i == batch_count-1:
        gc.collect()
        digraph = digraph[1].union(digraph[0])\
                            .orderBy(["origin", "terminus"],
                                     ascending=[True, True])\
                            .distinct()\
                            .persist(pyspark.StorageLevel(True, False,
                                                          False, True,
                                                          1))\
                            .checkpoint()
    df = digraph.toPandas()
    sc.stop()
    return df
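generate_digraph persists each accumulated union to disk and then checkpoints it so the lineage does not keep growing with every batch. A stripped-down sketch of that persist-then-checkpoint idiom, with a hypothetical checkpoint directory and a tiny DataFrame:

import pyspark

sc = pyspark.SparkContext.getOrCreate()
sc.setCheckpointDir("/tmp/tc_checkpoints")   # hypothetical local path
spark = pyspark.sql.SparkSession(sc)

df = spark.createDataFrame([(0, 1), (1, 2)], ["origin", "terminus"])
df = (df.distinct()
        .persist(pyspark.StorageLevel(True, False, False, True, 1))  # disk-backed, single replica
        .checkpoint())                       # truncates lineage; requires setCheckpointDir above
print(df.count())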
Example #3
def create_df_from_generator(gen, names):
    # assumes a module-level SparkContext `sc` and SQLContext `sqlContext`
    a = sc.parallelize(gen, 20)
    a.persist(pyspark.StorageLevel(True, True, False, True, 1))
    df = sqlContext.createDataFrame(a, schema=names,
                                    samplingRatio=None).repartition(20)
    #df.persist(pyspark.StorageLevel(True, True, False, True, 1))
    return df
Example #4
import gc
import os
import shutil
import time

import pyspark


def transitive_closure_from_dataframe(digraph):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    save_file = os.path.join(dir_path,'solution.pickle')
    try:
        shutil.rmtree(save_file)
    except OSError:
        pass
    sc = pyspark.context.SparkContext.getOrCreate()
    sc.setCheckpointDir("~/.transitive_closure")
    spark = pyspark.sql.SparkSession(sc)
    # Translation to Spark format is ludicrously slow without PyArrow
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    start_time = time.time()
    orig_digraph = spark.createDataFrame(digraph.copy())
    new_edges = spark.createDataFrame(digraph.copy()).checkpoint()
    new_edges_mem = digraph
    closed_digraph = spark.createDataFrame(digraph).checkpoint()
    while not new_edges_mem.empty:
        new_edges = spark.createDataFrame(new_edges_mem).persist(
            pyspark.StorageLevel(False, True, False, False, 1)).checkpoint()
        new_edges = new_edges.join(orig_digraph,
                                   (new_edges.terminus == orig_digraph.origin))\
                             .select(new_edges.origin, orig_digraph.terminus)\
                             .union(orig_digraph.join(new_edges,
                                 (new_edges.origin == orig_digraph.terminus))\
                                 .select(orig_digraph.origin, new_edges.terminus))\
                             .distinct()\
                             .exceptAll(closed_digraph)
        # I don't see any copy method, and PyArrow is nearly
        # instantaneous,  so I'm going to use pandas to copy the new
        # edges to memory.
        new_edges_mem = new_edges.toPandas().copy()
        closed_digraph = closed_digraph.union(new_edges.persist(
            pyspark.StorageLevel(True, False, False, True, 1)
            ).checkpoint())
        # Putting this in just because of how much trouble I had with the
        # generate_digraph function
        gc.collect()
    # Ideally, we would be able to use the fact that closed_digraph is
    # already stored on the disk, but I'm not quite sure how to do it.
    # Also, for clusters with separate hard drives,
    # we would want to mount it somewhere with HDFS or similar.
    closed_digraph.rdd.saveAsPickleFile(save_file)
    df = closed_digraph.toPandas()
    sc.stop()
    return df
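For reference, the fixed-point iteration that the join/exceptAll loop above performs can be written in a few lines of plain Python; this toy version (hypothetical three-edge graph) stops, like the Spark loop, once a pass produces no edges that are not already in the closure:

edges = {(0, 1), (1, 2), (2, 3)}
closed, new = set(edges), set(edges)
while new:
    # join new edges with the original edges on both sides, exactly as the
    # two DataFrame joins do, then keep only edges not seen before
    candidates = {(a, d) for (a, b) in new for (c, d) in edges if b == c} | \
                 {(a, d) for (a, b) in edges for (c, d) in new if b == c}
    new = candidates - closed
    closed |= new
print(sorted(closed))  # [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]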
Example #5
def create_dataframe(spark_context, sql_context, table, column_names):

    data = spark_context.parallelize(table, 20)
    data.persist(pyspark.StorageLevel(True, True, False, True, 1))

    df = sql_context.createDataFrame(data,
                                     schema=column_names,
                                     samplingRatio=None).repartition(20)
    # df.persist(pyspark.StorageLevel(True, True, False, True, 1))

    return df
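A hypothetical call to create_dataframe, assuming a local SparkContext and SQLContext already exist (the row data and column names below are illustrative only):

import pyspark
from pyspark.sql import SQLContext

sc = pyspark.SparkContext.getOrCreate()
sql_context = SQLContext(sc)
df = create_dataframe(sc, sql_context, [(1, "alice"), (2, "bob")], ["id", "name"])
df.show()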
Example #6
    return result
    #print(personName+","+changeManager+","+promote+","+str(ManagerCata.index(lastManager)) )


def getCataDict():
    return {0: 2, 1: 2, 2: len(ManagerCata)}


couchbase_host = '10.1.193.189'
couchbase_bucket = 'persona'

couchbucket = Couchbase.connect(bucket=couchbase_bucket, host=couchbase_host)

# reference http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.tree

pyspark.StorageLevel(True, False, False, False, 1)  # equivalent to StorageLevel.DISK_ONLY; constructed but not assigned
sc = SparkContext("local", "ad RD leave predict")

if __name__ == '__main__':
    prepareLeaverList('../var/all2015rdleaves')
    data = []
    l = 500
    for i in range(6):
        data.extend(buildDataFromCouchbase(l, l * i))

    cataDict = getCataDict()

    model = GradientBoostedTrees.trainClassifier(sc.parallelize(data),
                                                 cataDict,
                                                 numIterations=10,
                                                 maxBins=500)
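GradientBoostedTrees.trainClassifier takes the RDD of labeled points plus categoricalFeaturesInfo, a dict mapping feature index to the number of categories, which is what getCataDict() supplies above. A hedged, self-contained sketch with hypothetical toy data:

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

sc = SparkContext.getOrCreate()
# hypothetical toy set: two binary categorical features and one continuous feature
toy_points = [
    LabeledPoint(0.0, [0.0, 1.0, 2.5]),
    LabeledPoint(0.0, [0.0, 0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 1.0, 7.5]),
    LabeledPoint(1.0, [1.0, 0.0, 9.0]),
    LabeledPoint(0.0, [0.0, 1.0, 3.0]),
    LabeledPoint(1.0, [1.0, 0.0, 8.0]),
]
model = GradientBoostedTrees.trainClassifier(
    sc.parallelize(toy_points),
    categoricalFeaturesInfo={0: 2, 1: 2},  # feature index -> number of categories
    numIterations=10)
print(model.predict([1.0, 0.0, 8.5]))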
Example #7
    def __init__(self):
        pyspark.StorageLevel(True, True, False, False, 2)  # MEMORY_AND_DISK_2
Example #8
def main(result_dir_master, result_dir_s3):
    
    CON_CONFIGS = {}
    CON_CONFIGS["result_dir_master"] = result_dir_master
    CON_CONFIGS["result_dir_s3"] = result_dir_s3

    #
    ## user to specify: hyper-params
    
    # clustering
    CON_CONFIGS["n_clusters"] = 3
    CON_CONFIGS["warn_threshold_np_ratio"] = 1
    
    # classification
    CON_CONFIGS["n_eval_folds"] = 5
    CON_CONFIGS["n_cv_folds"] = 5  
    
    CON_CONFIGS["lambdas"] = list(10.0 ** numpy.arange(-2, 2, 1.0))
    CON_CONFIGS["alphas"] = list(numpy.linspace(0, 1, 3))
        
    CON_CONFIGS["desired_recalls"] = [0.025,0.05,0.075,0.1,0.125,0.15,0.175,0.2,0.225,0.25]
    # CON_CONFIGS["desired_recalls"] = [0.05,0.10]
    
    
    
    #
    ## read data and some meta stuff
    
    
    # user to specify : seed in Random Forest model
    CON_CONFIGS["seed"] = 42
    CON_CONFIGS["data_path"] = "s3://emr-rwes-pa-spark-dev-datastore/lichao.test/data/BI/smaller_data/"
    CON_CONFIGS["pos_file"] = "pos_70.0pct.csv"
    CON_CONFIGS["neg_file"] = "neg_70.0pct.csv"
    CON_CONFIGS["ss_file"] = "ss_70.0pct.csv"
    #reading in the data from S3
    spark = SparkSession.builder.appName(os.path.basename(__file__)).getOrCreate()
    org_pos_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] + CON_CONFIGS["pos_file"])
    org_neg_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] + CON_CONFIGS["neg_file"])\
        .select(org_pos_data.columns)
    org_ss_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] +CON_CONFIGS["ss_file"])\
        .select(org_pos_data.columns)
    
    
    # user to specify: original column names for predictors and output in data
    orgOutputCol = "label"
    matchCol = "matched_positive_id"
    patIDCol = "patid"
    nonFeatureCols = [matchCol, orgOutputCol, patIDCol]
    # orgPredictorCols = ["PATIENT_AGE", "LOOKBACK_DAYS", "LVL3_CHRN_ISCH_HD_FLAG", "LVL3_ABN_CHST_XRAY_FLAG"]
    # org_pos_data = org_pos_data.select(nonFeatureCols + orgPredictorCols)
    # org_neg_data = org_neg_data.select(nonFeatureCols + orgPredictorCols)
    # org_ss_data = org_ss_data.select(nonFeatureCols + orgPredictorCols)
    # sanity check 
    if type(org_pos_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_pos_data = org_pos_data.withColumn(orgOutputCol, org_pos_data[orgOutputCol].cast("double"))
    orgPredictorCols = [x for x in org_pos_data.columns if x not in nonFeatureCols]    
    orgPredictorCols4Clustering = [x for x in orgPredictorCols if "FLAG" in x]
    if type(org_neg_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_neg_data = org_neg_data.withColumn(orgOutputCol, org_neg_data[orgOutputCol].cast("double"))
    if type(org_ss_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_ss_data = org_ss_data.withColumn(orgOutputCol, org_ss_data[orgOutputCol].cast("double"))
    # 
    clusterFeatureCol = "cluster_features"
    clusterCol = "cluster_id"
    # user to specify: the collective column name for all predictors
    collectivePredictorCol = "features"
    # in-cluster distance
    distCol = "dist"
    # user to specify: the column name for prediction
    predictionCol = "probability"
    
    CON_CONFIGS["orgPredictorCols"] = orgPredictorCols
    CON_CONFIGS["orgPredictorCols4Clustering"] = orgPredictorCols4Clustering
    CON_CONFIGS["n_predictors_classification"] = len(orgPredictorCols)
    CON_CONFIGS["n_rows_pos"] = org_pos_data.count()
    CON_CONFIGS["n_rows_neg"] = org_neg_data.count()
    CON_CONFIGS["n_rows_ss"] = org_ss_data.count()
    save_analysis_info(\
        result_dir_master, 
        "analysis_info.txt", 
        CON_CONFIGS
        )

    # convert to ml-compatible format
    assembler = VectorAssembler(inputCols=orgPredictorCols, outputCol=collectivePredictorCol)
    posFeatureAssembledData = assembler.transform(org_pos_data)\
        .select(nonFeatureCols + [collectivePredictorCol])
    negFeatureAssembledData = assembler.transform(org_neg_data)\
        .select(nonFeatureCols + [collectivePredictorCol])
    #
    evalIDCol = "evalFoldID"
    cvIDCol = "cvFoldID"
    pos_neg_data = posFeatureAssembledData.union(negFeatureAssembledData)
    pos_neg_data_with_eval_ids = AppendDataMatchingFoldIDs(pos_neg_data, CON_CONFIGS["n_eval_folds"], matchCol, foldCol=evalIDCol)
    
    
    # the model (pipeline)
    classifier_spec = LogisticRegression(maxIter=100000, featuresCol=collectivePredictorCol,
                                         labelCol=orgOutputCol, standardization=True)
    evaluator = BinaryClassificationEvaluatorWithPrecisionAtRecall(\
        rawPredictionCol=predictionCol,
        labelCol=orgOutputCol,
        metricName="precisionAtGivenRecall",
        metricParams={"recallValue":0.05}\
    )
    paramGrid = ParamGridBuilder()\
               .addGrid(classifier_spec.regParam, CON_CONFIGS["lambdas"])\
               .addGrid(classifier_spec.elasticNetParam, CON_CONFIGS["alphas"])\
               .build()

    # cross-evaluation
    predictionsAllData = None
    
    kmeans = KMeans(featuresCol=clusterFeatureCol, predictionCol=clusterCol).setK(CON_CONFIGS["n_clusters"])
    cluster_assembler = VectorAssembler(inputCols=orgPredictorCols4Clustering, outputCol=clusterFeatureCol)
    
    metricSets = [{"metricName": "precisionAtGivenRecall", "metricParams": {"recallValue": x}} for x in CON_CONFIGS["desired_recalls"]]
    
    filename_loop_info = result_dir_master + "loop_info.txt"
    file_loop_info = open(filename_loop_info, "w")    
    
    inputTrivialNegPredCols = ["_pos_prob", "_neg_prob"]
    trivial_neg_pred_assembler = VectorAssembler(inputCols=inputTrivialNegPredCols, outputCol=predictionCol)
    
    for iFold in range(CON_CONFIGS["n_eval_folds"]):
        
        
        condition = pos_neg_data_with_eval_ids[evalIDCol] == iFold
        leftoutFold = pos_neg_data_with_eval_ids.filter(condition).drop(evalIDCol)
        trainFolds = pos_neg_data_with_eval_ids.filter(~condition).drop(evalIDCol)
        
        file_loop_info.write("####################################################################\n\n".format(iFold))
        file_loop_info.write("iFold: {}\n\n".format(iFold))
        file_loop_info.write("n_rows of leftoutFold: {}\n".format(leftoutFold.count()))
        file_loop_info.write("n_rows of trainFolds: {}\n".format(trainFolds.count()))        
        
        #
        ## clustering to be done here
        
        pos_data_4_clustering = trainFolds\
            .filter(F.col(orgOutputCol)==1)\
            .select(patIDCol)\
            .join(org_pos_data, patIDCol)
        pos_data_4_clustering_assembled = cluster_assembler.transform(pos_data_4_clustering)\
            .select([patIDCol, matchCol] + [clusterFeatureCol])
        cluster_model, clustered_pos = clustering(pos_data_4_clustering_assembled, kmeans, 
                                    clusterFeatureCol, clusterCol, distCol) 
        
        nPosesAllClusters = clustered_pos.count()
        predictionsOneFold = None
        
        file_loop_info.write("nPosesAllClusters: {}\n".format(nPosesAllClusters))
        
        for i_cluster in range(CON_CONFIGS["n_clusters"]):
        
            file_loop_info.write("i_cluster: {}\n\n".format(i_cluster))
            
            # the positive data for training the classifier
            train_pos = clustered_pos\
                .filter(clustered_pos[clusterCol]==i_cluster)\
                .select(patIDCol)\
                .join(trainFolds, patIDCol)
            
            file_loop_info.write("n_rows of train_pos: {}\n".format(train_pos.count()))
            posPctThisClusterVSAllClusters = float(train_pos.count()) / nPosesAllClusters
            file_loop_info.write("posPctThisClusterVSAllClusters: {}\n".format(posPctThisClusterVSAllClusters))
            # select negative training data based on the clustering result
            corresponding_neg = train_pos\
                .select(matchCol)\
                .join(org_neg_data, matchCol)
            corresponding_neg_4_clustering_assembled = cluster_assembler.transform(corresponding_neg)\
                .select([patIDCol, matchCol] + [clusterFeatureCol])
            similar_neg_ids = select_certain_pct_ids_per_positive_closest_to_cluster_centre(\
                corresponding_neg_4_clustering_assembled, 
                clusterFeatureCol, 
                cluster_model.clusterCenters()[i_cluster], 
                posPctThisClusterVSAllClusters, 
                patIDCol,
                matchCol
            )
            train_data = similar_neg_ids\
                .join(trainFolds, patIDCol)\
                .select(train_pos.columns)\
                .union(train_pos)
            file_loop_info.write("n_rows of train_data: {}\n".format(train_data.count()))
            
            trainDataWithCVFoldID = AppendDataMatchingFoldIDs(train_data, CON_CONFIGS["n_cv_folds"], matchCol, foldCol=cvIDCol)
            trainDataWithCVFoldID = trainDataWithCVFoldID.coalesce(int(trainFolds.rdd.getNumPartitions() * posPctThisClusterVSAllClusters) + 1)
            # sanity check: if there are too few negatives for any positive 
            # thresh_n_neg_per_fold = round(train_pos.count() / float(CON_CONFIGS["n_cv_folds"])) * CON_CONFIGS["warn_threshold_np_ratio"]
            # neg_counts_all_cv_folds = trainDataWithCVFoldID\
                # .filter(F.col(orgOutputCol)==0)\
                # .groupBy(cvIDCol)\
                # .agg(F.count(orgOutputCol).alias("_tmp"))\
                # .select("_tmp")\
                # .collect()
            # if any(map(lambda x: x["_tmp"] < thresh_n_neg_per_fold, neg_counts_all_cv_folds)):
                # raise ValueError("Insufficient number of negative data in at least one cv fold.")
                
            
        
        
            #
            ## train the classifier     
            

            validator = CrossValidatorWithStratificationID(\
                            estimator=classifier_spec,
                            estimatorParamMaps=paramGrid,
                            evaluator=evaluator,
                            stratifyCol=cvIDCol\
                        )
            cvModel = validator.fit(trainDataWithCVFoldID)
            
            
            #
            ## test data
            
            
            entireTestData = org_ss_data\
                .join(leftoutFold.filter(F.col(orgOutputCol)==1).select(matchCol), matchCol).select(org_pos_data.columns)\
                .union(org_pos_data.join(leftoutFold.select(patIDCol), patIDCol).select(org_pos_data.columns))\
                .union(org_neg_data.join(leftoutFold.select(patIDCol), patIDCol).select(org_pos_data.columns))
            entireTestDataAssembled4Clustering = cluster_assembler.transform(entireTestData)\
                    .select([patIDCol, matchCol] + [clusterFeatureCol])
            file_loop_info.write("n_rows of entireTestData: {}\n".format(entireTestData.count()))
            
            filteredTestData = select_certain_pct_overall_ids_closest_to_cluster_centre(\
                entireTestDataAssembled4Clustering, 
                clusterFeatureCol, 
                cluster_model.clusterCenters()[i_cluster], 
                posPctThisClusterVSAllClusters, 
                patIDCol
            ).join(entireTestData, patIDCol)
            
            file_loop_info.write("n_rows of filteredTestData: {}\n".format(filteredTestData.count()))
            
            filteredTestDataAssembled = assembler.transform(filteredTestData)\
                .select(nonFeatureCols + [collectivePredictorCol])       
            
            # testing
            
            predictions = cvModel\
                .transform(filteredTestDataAssembled)\
                .select(nonFeatureCols + [collectivePredictorCol, predictionCol])
            
            # need to union the test data filtered away (all classified as negative)
            
            discarded_test_ids = entireTestData\
                .select(patIDCol)\
                .subtract(filteredTestData.select(patIDCol))
            discardedTestData = discarded_test_ids\
                .join(entireTestData, patIDCol)
            discardedTestDataAssembled = assembler.transform(discardedTestData)\
                .select(nonFeatureCols + [collectivePredictorCol])
            predictionsDiscardedTestData = discardedTestDataAssembled\
                .withColumn(inputTrivialNegPredCols[0], F.lit(0.0))\
                .withColumn(inputTrivialNegPredCols[1], F.lit(1.0))
            predictionsDiscardedTestDataAssembled = trivial_neg_pred_assembler\
                .transform(predictionsDiscardedTestData)\
                .select(predictions.columns)
            
            predictionsEntireTestData = predictions.union(predictionsDiscardedTestDataAssembled)

            metricValuesOneCluster = evaluator\
                .evaluateWithSeveralMetrics(predictionsEntireTestData, metricSets = metricSets)
            file_name_metrics_one_cluster = result_dir_master + "metrics_cluster_" + str(i_cluster) + "fold_" + str(iFold) + "_.csv"
            save_metrics(file_name_metrics_one_cluster, metricValuesOneCluster)
            predictionsEntireTestData.write.csv(result_dir_s3 + "predictions_fold_" + str(iFold) + "_cluster_" + str(i_cluster) + ".csv")
            predictionsEntireTestData.persist(pyspark.StorageLevel(True, False, False, False, 1))

            if predictionsOneFold is not None:
                predictionsOneFold = predictionsOneFold.union(predictionsEntireTestData)
            else:
                predictionsOneFold = predictionsEntireTestData
            
            # save the metrics for all hyper-parameter sets in cv
            cvMetrics = cvModel.avgMetrics
            cvMetricsFileName = result_dir_s3 + "cvMetrics_cluster_" + str(i_cluster) + "_fold_" + str(iFold)
            cvMetrics.coalesce(4).write.csv(cvMetricsFileName, header="true")

            # save the hyper-parameters of the best model
            
            bestParams = validator.getBestModelParams()
            file_best_params = result_dir_master + "bestParams_cluster_" + str(i_cluster) + "_fold_" + str(iFold) + ".txt"
            with open(file_best_params, "w") as fileBestParams:
                fileBestParams.write(str(bestParams))
            os.chmod(file_best_params, 0o777)
            
        
        # summarise all clusters from the fold
        
        metricValuesOneFold = evaluator\
            .evaluateWithSeveralMetrics(predictionsOneFold, metricSets = metricSets)            
        file_name_metrics_one_fold = result_dir_master + "metrics_fold_" + str(iFold) + "_.csv"
        save_metrics(file_name_metrics_one_fold, metricValuesOneFold)
        
        if predictionsAllData is not None:
            predictionsAllData = predictionsAllData.union(predictionsOneFold)
        else:
            predictionsAllData = predictionsOneFold
            

    # save all predictions
    predictionsFileName = result_dir_s3 + "predictionsAllData"
    predictionsAllData.select(orgOutputCol,
                              getitem(1)(predictionCol).alias('prob_1'))\
        .write.csv(predictionsFileName, header="true")
    # metrics of predictions on the entire dataset
    metricValues = evaluator\
        .evaluateWithSeveralMetrics(predictionsAllData, metricSets = metricSets)
    save_metrics(result_dir_master + "metricValuesEntireData.csv", metricValues)
    
    file_loop_info.close()
    os.chmod(filename_loop_info, 0o777)
    
    spark.stop()
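The pipeline above relies on project-specific helpers (AppendDataMatchingFoldIDs, CrossValidatorWithStratificationID, BinaryClassificationEvaluatorWithPrecisionAtRecall), but its assemble / grid-search / evaluate skeleton maps onto stock pyspark.ml classes. A rough sketch of that skeleton using the built-in CrossValidator, with a hypothetical two-feature dataset and area-under-ROC in place of the custom precision-at-recall metric:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cv_sketch").getOrCreate()
data = spark.createDataFrame(
    [(0.0, 1.0, 0.2), (1.0, 3.0, 1.5), (0.0, 0.5, 0.1), (1.0, 2.8, 1.9)] * 10,
    ["label", "x1", "x2"])
assembled = VectorAssembler(inputCols=["x1", "x2"], outputCol="features").transform(data)

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1, 1.0])          # analogue of CON_CONFIGS["lambdas"]
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])    # analogue of CON_CONFIGS["alphas"]
        .build())
evaluator = BinaryClassificationEvaluator(labelCol="label")  # areaUnderROC by default
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(assembled)
print(cv_model.avgMetrics)   # one averaged metric per parameter combination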