# Build LabeledPoints from the transformed births data, replacing the
# BIRTH_PLACE column (row[1]) with its 7-bucket HashingTF encoding.
# NOTE(review): relies on names defined elsewhere in this file
# (births_transformed, hashing, features_to_keep, reg, ln) — confirm against
# the file header.
births_hashed = births_transformed \
    .rdd \
    .map(lambda row: [
        list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE' else row[i]
        # scan each row field by field (row[0], row[1], ...), rebuilding the
        # list of kept columns in order
        for i, col in enumerate(features_to_keep)]) \
    .map(lambda row: [[e] if type(e) == int else e for e in row]) \
    .map(lambda row: [item for sublist in row for item in sublist]) \
    .map(lambda row: reg.LabeledPoint(
        row[0], ln.Vectors.dense(row[1:]))
    )
'''
map1
births_hashed = births_transformed \
    .rdd \
    .map(lambda row: [
        list('1') if col == 'BIRTH_PLACE' else row[i]
        for i, col in enumerate(features_to_keep)])
结果 每一行 BIRTH_PLACE列替换成[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
plt.bar(pos, y_axis, width = width, color='lightblue')
fig = plt.gcf()
fig.set_size_inches(12, 7)
plt.plot()
plt.show()
'''
# --- Naive Bayes training fragment (Python 2 syntax: bare `print`).
# NOTE(review): `feature_vectors`, `load` and `sc` are defined elsewhere —
# confirm before reuse.
import pyspark.mllib.regression as mllib_reg
import pyspark.mllib.linalg as mllib_lalg
import pyspark.mllib.classification as mllib_class
import pyspark.mllib.tree as mllib_tree

print "CONVERTING FEATURE VECTOR FORMAT TO LabeledPoint"
# Last field of each row is the label; the rest become the dense feature vector.
labeled_data = feature_vectors.map(lambda fields: mllib_reg.LabeledPoint(
    fields[-1], mllib_lalg.Vectors.dense(fields[:-1])))

print "SPLITTING TRAINING AND TESTING DATA"
train, test = labeled_data.randomSplit([0.7, 0.3], seed=13)

# parameters:
lamda = 1.0  # Naive Bayes additive-smoothing parameter ("lamda" sic — misspelling of lambda)

print "TRAINING"
if load:
    nbay = mllib_class.NaiveBayesModel.load(sc, "nbay-l1.00")  # LOAD CLASSIFIER
# initialize classifier:
else:
    nbay = mllib_class.NaiveBayes.train(train, lamda)  # TRAIN CLASSIFIER
def __init__(self, train, confidence_threshold=0.5):
    """Set up per-model bookkeeping and fit three tree-based classifiers.

    :param train: sequence of rows where the last element of each row is the
        class label and the preceding elements are the features.
    :param confidence_threshold: probability cut-off separating "confident"
        from "ambiguous" predictions.
    """
    self.train = train
    self.test = []
    self.confidence_threshold = confidence_threshold
    # Prediction-confidence bookkeeping.
    self.confident_data = 0        # number of data points with a confident prediction
    self.ambiguous_datacount = 0   # number of data points with an ambiguous prediction
    self.ambiguous_data = []
    # Per-model accuracy slots.
    self.AccModel1 = 0.0
    self.AccModel2 = 0.0
    self.AccModel3 = 0.0
    self.maliciousSamples = []
    self.benignSamples = []
    # Confusion-matrix cells, one inner list per model.
    self.TNarr = [[] for _ in range(3)]
    self.TParr = [[] for _ in range(3)]
    self.FNarr = [[] for _ in range(3)]
    self.FParr = [[] for _ in range(3)]
    # Derived rates, kept as empty strings until computed (one per model).
    self.TNRarr = ["" for _ in range(3)]
    self.TPRarr = ["" for _ in range(3)]
    self.FNRarr = ["" for _ in range(3)]
    self.FPRarr = ["" for _ in range(3)]
    self.PPVarr = ["" for _ in range(3)]
    self.NPVarr = ["" for _ in range(3)]
    self.FDRarr = ["" for _ in range(3)]
    self.F1arr = ["" for _ in range(3)]
    # Last column of each training row is the label; the rest are features.
    labeled_points = map(lambda x: regression.LabeledPoint(x[-1], x[:-1]), self.train)
    train_rdd = sc.parallelize(labeled_points)
    # Train the three ensemble members on the same RDD.
    self.models = [
        tree.DecisionTree.trainClassifier(
            data=train_rdd, numClasses=2, categoricalFeaturesInfo={},
            impurity='gini', maxDepth=5, maxBins=32,
            minInstancesPerNode=1, minInfoGain=0.0),
        tree.RandomForest.trainClassifier(
            data=train_rdd, numClasses=2, categoricalFeaturesInfo={},
            numTrees=3, featureSubsetStrategy='auto', impurity='gini',
            maxDepth=4, maxBins=32, seed=None),
        tree.GradientBoostedTrees.trainClassifier(
            data=train_rdd, categoricalFeaturesInfo={}, loss='logLoss',
            numIterations=10, learningRate=0.1, maxDepth=3, maxBins=32),
    ]
def infant_survival_mllib():
    """Predict infant survival with the classic Spark MLlib (RDD) API.

    End-to-end pipeline: load the births dataset with an explicit schema,
    recode Y/N/U string columns to 1/0 integers, explore the features
    (descriptive stats, frequencies, correlations, chi-square tests), then
    train and evaluate a logistic regression and a random forest model.

    NOTE(review): relies on module-level imports not visible in this chunk
    (SparkSession, types, func, np, mllib_stat, mllib_linalg, mllib_feature,
    mllib_regression, mllib_eval, LogisticRegressionWithLBFGS, RandomForest)
    — confirm against the file header.
    """
    spark = SparkSession.builder.appName('infant-survival-mllib').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')
    # Explicit (column name, Spark type) pairs for every CSV column.
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.StringType()),
        ('BIRTH_YEAR', types.IntegerType()),
        ('BIRTH_MONTH', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('MOTHER_RACE_6CODE', types.StringType()),
        ('MOTHER_EDUCATION', types.StringType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('FATHER_EDUCATION', types.StringType()),
        ('MONTH_PRECARE_RECODE', types.StringType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_BMI_RECODE', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.StringType()),
        ('DIABETES_GEST', types.StringType()),
        ('HYP_TENS_PRE', types.StringType()),
        ('HYP_TENS_GEST', types.StringType()),
        ('PREV_BIRTH_PRETERM', types.StringType()),
        ('NO_RISK', types.StringType()),
        ('NO_INFECTIONS_REPORTED', types.StringType()),
        ('LABOR_IND', types.StringType()),
        ('LABOR_AUGM', types.StringType()),
        ('STEROIDS', types.StringType()),
        ('ANTIBIOTICS', types.StringType()),
        ('ANESTHESIA', types.StringType()),
        ('DELIV_METHOD_RECODE_COMB', types.StringType()),
        ('ATTENDANT_BIRTH', types.StringType()),
        ('APGAR_5', types.IntegerType()),
        ('APGAR_5_RECODE', types.StringType()),
        ('APGAR_10', types.IntegerType()),
        ('APGAR_10_RECODE', types.StringType()),
        ('INFANT_SEX', types.StringType()),
        ('OBSTETRIC_GESTATION_WEEKS', types.IntegerType()),
        ('INFANT_WEIGHT_GRAMS', types.IntegerType()),
        ('INFANT_ASSIST_VENTI', types.StringType()),
        ('INFANT_ASSIST_VENTI_6HRS', types.StringType()),
        ('INFANT_NICU_ADMISSION', types.StringType()),
        ('INFANT_SURFACANT', types.StringType()),
        ('INFANT_ANTIBIOTICS', types.StringType()),
        ('INFANT_SEIZURES', types.StringType()),
        ('INFANT_NO_ABNORMALITIES', types.StringType()),
        ('INFANT_ANCEPHALY', types.StringType()),
        ('INFANT_MENINGOMYELOCELE', types.StringType()),
        ('INFANT_LIMB_REDUCTION', types.StringType()),
        ('INFANT_DOWN_SYNDROME', types.StringType()),
        ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', types.StringType()),
        ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', types.StringType()),
        ('INFANT_BREASTFED', types.StringType())
    ]
    # Every field declared non-nullable (third StructField argument).
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_train.csv.gz', header=True, schema=schema)
    # Keep only the target column plus candidate predictors.
    selected_features = [
        'INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE', 'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE', 'CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI',
        'CIG_3_TRI', 'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN', 'DIABETES_PRE',
        'DIABETES_GEST', 'HYP_TENS_PRE', 'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM'
    ]
    births_trimmed = births.select(selected_features)
    recode_dictionary = {'YNU': {'Y': 1, 'N': 0, 'U': 0}}  # Yes/No/Unknown.
    def recode(col, key):
        # Map a single 'Y'/'N'/'U' cell value to 1/0/0 via recode_dictionary.
        return recode_dictionary[key][col]
    def correct_cig(feat):
        # 99 is the "unknown" sentinel for cigarette counts; treat it as 0.
        return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0)
    rec_integer = func.udf(recode, types.IntegerType())
    births_transformed = births_trimmed \
        .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \
        .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \
        .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \
        .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))
    # Find string columns whose distinct values include 'Y' — these get recoded.
    cols = [(col.name, col.dataType) for col in births_trimmed.schema]
    YNU_cols = []
    for i, s in enumerate(cols):
        if s[1] == types.StringType():
            dis = births.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect()
            if 'Y' in dis:
                YNU_cols.append(s[0])
    # Demonstration of the recode UDF on one column (result is discarded).
    births.select(['INFANT_NICU_ADMISSION', rec_integer('INFANT_NICU_ADMISSION', func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE')
    ]).take(5)
    # Recode every Y/N/U column in place; pass the others through unchanged.
    exprs_YNU = [rec_integer(x, func.lit('YNU')).alias(x) if x in YNU_cols else x for x in births_transformed.columns]
    births_transformed = births_transformed.select(exprs_YNU)
    births_transformed.select(YNU_cols[-5:]).show(5)
    # Calculate the descriptive statistics of the numeric features.
    numeric_cols = ['MOTHER_AGE_YEARS','FATHER_COMBINED_AGE',
        'CIG_BEFORE','CIG_1_TRI','CIG_2_TRI','CIG_3_TRI',
        'MOTHER_HEIGHT_IN','MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT','MOTHER_WEIGHT_GAIN'
    ]
    numeric_rdd = births_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row])
    mllib_stats = mllib_stat.Statistics.colStats(numeric_rdd)
    # Print mean and standard deviation (sqrt of variance) per numeric column.
    for col, m, v in zip(numeric_cols, mllib_stats.mean(), mllib_stats.variance()):
        print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))
    # Calculate frequencies for the categorical variables.
    categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]
    categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])
    for i, col in enumerate(categorical_cols):
        # Group by the value of column i, then count occurrences of each value.
        agg = categorical_rdd.groupBy(lambda row: row[i]).map(lambda row: (row[0], len(row[1])))
        print(col, sorted(agg.collect(), key=lambda el: el[1], reverse=True))
    # Correlation.
    corrs = mllib_stat.Statistics.corr(numeric_rdd)
    # Report every off-diagonal pair whose correlation exceeds 0.5.
    for i, el in enumerate(corrs > 0.5):
        correlated = [(numeric_cols[j], corrs[i][j]) for j, e in enumerate(el) if e == 1.0 and j != i]
        if len(correlated) > 0:
            for e in correlated:
                print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))
    # Drop most of highly correlated features.
    features_to_keep = [
        'INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE', 'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE', 'CIG_1_TRI', 'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT', 'DIABETES_PRE', 'DIABETES_GEST', 'HYP_TENS_PRE',
        'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM'
    ]
    births_transformed = births_transformed.select([e for e in features_to_keep])
    #--------------------
    # Statistical testing.
    # Run a Chi-square test to determine if there are significant differences for categorical variables.
    for cat in categorical_cols[1:]:
        # Contingency table: survival outcome (rows) x category values (columns).
        agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT').pivot(cat).count()
        # Flatten the counts column-major, substituting 0 for missing cells.
        agg_rdd = agg.rdd.map(lambda row: (row[1:])).flatMap(lambda row: [0 if e == None else e for e in row]).collect()
        row_length = len(agg.collect()[0]) - 1
        agg = mllib_linalg.Matrices.dense(row_length, 2, agg_rdd)
        test = mllib_stat.Statistics.chiSqTest(agg)
        print(cat, round(test.pValue, 4))
    #--------------------
    # Machine learning.
    # Create an RDD of LabeledPoints.
    # HashingTF with 7 buckets encodes the BIRTH_PLACE category (row[1]).
    hashing = mllib_feature.HashingTF(7)
    births_hashed = births_transformed \
        .rdd \
        .map(lambda row: [list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE' else row[i] for i, col in enumerate(features_to_keep)]) \
        .map(lambda row: [[e] if type(e) == int else e for e in row]) \
        .map(lambda row: [item for sublist in row for item in sublist]) \
        .map(lambda row: mllib_regression.LabeledPoint(row[0], mllib_linalg.Vectors.dense(row[1:])))
    # Split into training and testing.
    births_train, births_test = births_hashed.randomSplit([0.6, 0.4])
    # Estimate a logistic regression model (fitted with the L-BFGS optimizer).
    LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)
    # Predict the classes for our testing set.
    # Pair each true label with its prediction; * 1.0 casts predictions to float.
    LR_results = (
        births_test.map(lambda row: row.label).zip(LR_Model.predict(births_test.map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))
    # Check how well or how bad our model performed.
    print('********************************************000')
    LR_evaluation = mllib_eval.BinaryClassificationMetrics(LR_results)
    print('********************************************001')
    print('Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR))
    print('********************************************002')
    print('Area under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC))
    print('********************************************003')
    LR_evaluation.unpersist()
    # Select the most predictable features using a Chi-Square selector (top 4).
    selector = mllib_feature.ChiSqSelector(4).fit(births_train)
    topFeatures_train = (
        births_train.map(lambda row: row.label).zip(selector.transform(births_train.map(lambda row: row.features)))
    ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))
    topFeatures_test = (
        births_test.map(lambda row: row.label).zip(selector.transform(births_test.map(lambda row: row.features)))
    ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))
    # Build a random forest model.
    RF_model = RandomForest.trainClassifier(data=topFeatures_train, numClasses=2, categoricalFeaturesInfo={}, numTrees=6, featureSubsetStrategy='all', seed=666)
    RF_results = (topFeatures_test.map(lambda row: row.label).zip(RF_model.predict(topFeatures_test.map(lambda row: row.features))))
    RF_evaluation = mllib_eval.BinaryClassificationMetrics(RF_results)
    print('Area under PR: {0:.2f}'.format(RF_evaluation.areaUnderPR))
    print('Area under ROC: {0:.2f}'.format(RF_evaluation.areaUnderROC))
    RF_evaluation.unpersist()
    # See how the logistic regression would perform with reduced number of features.
    LR_Model_2 = LogisticRegressionWithLBFGS.train(topFeatures_train, iterations=10)
    LR_results_2 = (
        topFeatures_test.map(lambda row: row.label).zip(LR_Model_2.predict(topFeatures_test.map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))
    LR_evaluation_2 = mllib_eval.BinaryClassificationMetrics(LR_results_2)
    print('Area under PR: {0:.2f}'.format(LR_evaluation_2.areaUnderPR))
    print('Area under ROC: {0:.2f}'.format(LR_evaluation_2.areaUnderROC))
    LR_evaluation_2.unpersist()