# Build an RDD of LabeledPoints from the transformed births DataFrame:
# 1) replace the BIRTH_PLACE code (row[1]) with its hashed numeric vector,
# 2) wrap every int scalar in a one-element list,
# 3) flatten each row of lists into a single flat list,
# 4) use the first field as the label and the rest as a dense feature vector.
# NOTE(review): `hashing`, `reg` and `ln` are defined elsewhere in the file --
# presumably a mllib HashingTF instance and the mllib regression/linalg
# modules; confirm against the surrounding code.
births_hashed = births_transformed \
    .rdd \
    .map(lambda row: [
            list(hashing.transform(row[1]).toArray())
                if col == 'BIRTH_PLACE'
                else row[i]  # scan row by row: the fields of the current row are accessed as row[0], row[1], ...
            for i, col
            in enumerate(features_to_keep)]) \
    .map(lambda row: [[e] if type(e) == int else e
                      for e in row]) \
    .map(lambda row: [item for sublist in row
                      for item in sublist]) \
    .map(lambda row: reg.LabeledPoint(
            row[0],
            ln.Vectors.dense(row[1:]))
        )
'''
map1
births_hashed = births_transformed \
    .rdd \
    .map(lambda row: [
            list('1')
                if col == 'BIRTH_PLACE'
                else row[i]
            for i, col
            in enumerate(features_to_keep)])
Result:
for each row,
the BIRTH_PLACE column is replaced by [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
Example #2
0
plt.bar(pos, y_axis, width = width, color='lightblue')
fig = plt.gcf()
fig.set_size_inches(12, 7)

plt.plot()
plt.show()

'''

import pyspark.mllib.regression as mllib_reg
import pyspark.mllib.linalg as mllib_lalg
import pyspark.mllib.classification as mllib_class
import pyspark.mllib.tree as mllib_tree

# Convert each feature vector into an MLlib LabeledPoint: the last field is
# the class label, the remaining fields form the dense feature vector.
# (print() with a single argument is valid in both Python 2 and 3; the old
# `print "..."` statement form is a syntax error under Python 3.)
print("CONVERTING FEATURE VECTOR FORMAT TO LabeledPoint")
labeled_data = feature_vectors.map(lambda fields: mllib_reg.LabeledPoint(
    fields[-1], mllib_lalg.Vectors.dense(fields[:-1])))

print("SPLITTING TRAINING AND TESTING DATA")
# Fixed seed keeps the 70/30 split reproducible across runs.
train, test = labeled_data.randomSplit([0.7, 0.3], seed=13)

# parameters:
lamda = 1.0  # Naive Bayes additive (Laplace) smoothing parameter.

print("TRAINING")

if load:
    nbay = mllib_class.NaiveBayesModel.load(sc, "nbay-l1.00")  # LOAD CLASSIFIER
# initialize classifier:
else:
    nbay = mllib_class.NaiveBayes.train(train, lamda)  # TRAIN CLASSIFIER
    def __init__(self, train, confidence_threshold=0.5):
        """Store the training data, result buckets and an ensemble of three
        MLlib tree-based classifiers trained on ``train``.

        ``train`` is a sequence of feature rows whose last element is the
        class label; ``confidence_threshold`` is the cutoff separating
        confident from ambiguous predictions.
        """
        self.train = train
        self.test = []
        # Running counts of confidently vs. ambiguously classified points.
        self.confident_data = 0
        self.ambiguous_datacount = 0
        # One accuracy slot per model in the ensemble.
        self.AccModel1 = 0.0
        self.AccModel2 = 0.0
        self.AccModel3 = 0.0
        self.ambiguous_data = []
        self.confidence_threshold = confidence_threshold
        self.maliciousSamples = []
        self.benignSamples = []
        # Per-model confusion-matrix sample buckets (one inner list per model;
        # independent lists, so do NOT collapse these into `[[]] * 3`).
        self.TNarr = [[] for _ in range(3)]
        self.TParr = [[] for _ in range(3)]
        self.FNarr = [[] for _ in range(3)]
        self.FParr = [[] for _ in range(3)]
        # Per-model formatted rate strings, filled in later (immutable
        # placeholders, so shared-reference replication is safe here).
        self.TNRarr = [""] * 3
        self.TPRarr = [""] * 3
        self.FNRarr = [""] * 3
        self.FPRarr = [""] * 3
        self.PPVarr = [""] * 3
        self.NPVarr = [""] * 3
        self.FDRarr = [""] * 3
        self.F1arr = [""] * 3

        # Last column is the label; everything before it is the feature vector.
        train_rdd = sc.parallelize(
            [regression.LabeledPoint(sample[-1], sample[:-1])
             for sample in self.train])
        # Ensemble of three tree-based classifiers (earlier experiments with
        # logistic regression, SVM and Naive Bayes were abandoned).
        self.models = [
            tree.DecisionTree.trainClassifier(data=train_rdd,
                                              numClasses=2,
                                              categoricalFeaturesInfo={},
                                              impurity='gini',
                                              maxDepth=5,
                                              maxBins=32,
                                              minInstancesPerNode=1,
                                              minInfoGain=0.0),
            tree.RandomForest.trainClassifier(data=train_rdd,
                                              numClasses=2,
                                              categoricalFeaturesInfo={},
                                              numTrees=3,
                                              featureSubsetStrategy='auto',
                                              impurity='gini',
                                              maxDepth=4,
                                              maxBins=32,
                                              seed=None),
            tree.GradientBoostedTrees.trainClassifier(
                data=train_rdd,
                categoricalFeaturesInfo={},
                loss='logLoss',
                numIterations=10,
                learningRate=0.1,
                maxDepth=3,
                maxBins=32)
        ]
# Example #4
# 0
def infant_survival_mllib():
    """Predict infant survival with the RDD-based ``pyspark.mllib`` API.

    Pipeline: load the births CSV with an explicit schema, clean and recode
    the features, compute descriptive statistics / correlations / chi-square
    tests, hash the categorical BIRTH_PLACE feature, then train and evaluate
    logistic-regression and random-forest classifiers.

    NOTE(review): depends on module-level names (SparkSession, types, func,
    np, mllib_stat, mllib_linalg, mllib_feature, mllib_regression,
    mllib_eval, LogisticRegressionWithLBFGS, RandomForest) imported
    elsewhere in the file -- confirm they are in scope.
    """
    spark = SparkSession.builder.appName('infant-survival-mllib').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # (column name, Spark SQL type) for every column of the raw CSV.
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.StringType()),
        ('BIRTH_YEAR', types.IntegerType()),
        ('BIRTH_MONTH', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('MOTHER_RACE_6CODE', types.StringType()),
        ('MOTHER_EDUCATION', types.StringType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('FATHER_EDUCATION', types.StringType()),
        ('MONTH_PRECARE_RECODE', types.StringType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_BMI_RECODE', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.StringType()),
        ('DIABETES_GEST', types.StringType()),
        ('HYP_TENS_PRE', types.StringType()),
        ('HYP_TENS_GEST', types.StringType()),
        ('PREV_BIRTH_PRETERM', types.StringType()),
        ('NO_RISK', types.StringType()),
        ('NO_INFECTIONS_REPORTED', types.StringType()),
        ('LABOR_IND', types.StringType()),
        ('LABOR_AUGM', types.StringType()),
        ('STEROIDS', types.StringType()),
        ('ANTIBIOTICS', types.StringType()),
        ('ANESTHESIA', types.StringType()),
        ('DELIV_METHOD_RECODE_COMB', types.StringType()),
        ('ATTENDANT_BIRTH', types.StringType()),
        ('APGAR_5', types.IntegerType()),
        ('APGAR_5_RECODE', types.StringType()),
        ('APGAR_10', types.IntegerType()),
        ('APGAR_10_RECODE', types.StringType()),
        ('INFANT_SEX', types.StringType()),
        ('OBSTETRIC_GESTATION_WEEKS', types.IntegerType()),
        ('INFANT_WEIGHT_GRAMS', types.IntegerType()),
        ('INFANT_ASSIST_VENTI', types.StringType()),
        ('INFANT_ASSIST_VENTI_6HRS', types.StringType()),
        ('INFANT_NICU_ADMISSION', types.StringType()),
        ('INFANT_SURFACANT', types.StringType()),
        ('INFANT_ANTIBIOTICS', types.StringType()),
        ('INFANT_SEIZURES', types.StringType()),
        ('INFANT_NO_ABNORMALITIES', types.StringType()),
        ('INFANT_ANCEPHALY', types.StringType()),
        ('INFANT_MENINGOMYELOCELE', types.StringType()),
        ('INFANT_LIMB_REDUCTION', types.StringType()),
        ('INFANT_DOWN_SYNDROME', types.StringType()),
        ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', types.StringType()),
        ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', types.StringType()),
        ('INFANT_BREASTFED', types.StringType())
    ]
    # All columns are declared non-nullable.
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_train.csv.gz', header=True, schema=schema)

    # Keep only the features relevant to the survival-prediction task.
    selected_features = [
        'INFANT_ALIVE_AT_REPORT',
        'BIRTH_PLACE',
        'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE',
        'CIG_BEFORE',
        'CIG_1_TRI',
        'CIG_2_TRI',
        'CIG_3_TRI',
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT',
        'MOTHER_WEIGHT_GAIN',
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM'
    ]
    births_trimmed = births.select(selected_features)

    recode_dictionary = {'YNU': {'Y': 1, 'N': 0, 'U': 0}}  # Yes/No/Unknown.

    def recode(col, key):
        # Map one categorical value through the chosen recode table.
        return recode_dictionary[key][col]

    def correct_cig(feat):
        # 99 encodes "unknown" for the cigarette counts; treat it as 0.
        return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0)

    rec_integer = func.udf(recode, types.IntegerType())

    births_transformed = births_trimmed \
        .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \
        .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \
        .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \
        .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))

    # Find the string columns that actually contain Y/N/U values.
    cols = [(col.name, col.dataType) for col in births_trimmed.schema]
    YNU_cols = []
    for i, s in enumerate(cols):
        if s[1] == types.StringType():
            dis = births.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect()
            if 'Y' in dis:
                YNU_cols.append(s[0])

    # Sanity-check the recoding UDF on a single column.
    births.select(['INFANT_NICU_ADMISSION',
        rec_integer('INFANT_NICU_ADMISSION', func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE')
    ]).take(5)

    # Recode every Y/N/U column to 1/0/0 in a single select.
    exprs_YNU = [rec_integer(x, func.lit('YNU')).alias(x) if x in YNU_cols else x for x in births_transformed.columns]
    births_transformed = births_transformed.select(exprs_YNU)
    births_transformed.select(YNU_cols[-5:]).show(5)

    # Calculate the descriptive statistics of the numeric features.
    numeric_cols = ['MOTHER_AGE_YEARS','FATHER_COMBINED_AGE',
        'CIG_BEFORE','CIG_1_TRI','CIG_2_TRI','CIG_3_TRI',
        'MOTHER_HEIGHT_IN','MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT','MOTHER_WEIGHT_GAIN'
    ]
    numeric_rdd = births_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row])

    mllib_stats = mllib_stat.Statistics.colStats(numeric_rdd)

    # mean / standard deviation per numeric column.
    for col, m, v in zip(numeric_cols, mllib_stats.mean(), mllib_stats.variance()):
        print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))

    # Calculate frequencies for the categorical variables.
    categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]
    categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])

    for i, col in enumerate(categorical_cols):
        # Bind i as a default argument so the closure shipped to the
        # executors captures the current value, not the loop variable.
        agg = categorical_rdd.groupBy(lambda row, i=i: row[i]).map(lambda row: (row[0], len(row[1])))
        print(col, sorted(agg.collect(), key=lambda el: el[1], reverse=True))

    # Pairwise correlation between the numeric features.
    corrs = mllib_stat.Statistics.corr(numeric_rdd)

    # Report every pair with |corr| above the 0.5 mask.
    for i, el in enumerate(corrs > 0.5):
        correlated = [(numeric_cols[j], corrs[i][j]) for j, e in enumerate(el) if e == 1.0 and j != i]
        if len(correlated) > 0:
            for e in correlated:
                print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))

    # Drop most of the highly correlated features.
    features_to_keep = [
        'INFANT_ALIVE_AT_REPORT',
        'BIRTH_PLACE',
        'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE',
        'CIG_1_TRI',
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM'
    ]
    births_transformed = births_transformed.select(features_to_keep)

    #--------------------
    # Statistical testing.

    # Run a Chi-square test to determine if there are significant differences for categorical variables.
    for cat in categorical_cols[1:]:
        agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT').pivot(cat).count()
        # Flatten the 2-row contingency table, replacing missing cells with 0.
        agg_rdd = agg.rdd.map(lambda row: (row[1:])).flatMap(lambda row: [0 if e is None else e for e in row]).collect()

        row_length = len(agg.collect()[0]) - 1
        agg = mllib_linalg.Matrices.dense(row_length, 2, agg_rdd)

        test = mllib_stat.Statistics.chiSqTest(agg)
        print(cat, round(test.pValue, 4))

    #--------------------
    # Machine learning.

    # Create an RDD of LabeledPoints; BIRTH_PLACE is hashed into 7 buckets,
    # ints are wrapped in lists, rows flattened, first field is the label.
    hashing = mllib_feature.HashingTF(7)

    births_hashed = births_transformed \
        .rdd \
        .map(lambda row: [list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE' else row[i] for i, col in enumerate(features_to_keep)]) \
        .map(lambda row: [[e] if isinstance(e, int) else e for e in row]) \
        .map(lambda row: [item for sublist in row for item in sublist]) \
        .map(lambda row: mllib_regression.LabeledPoint(row[0], mllib_linalg.Vectors.dense(row[1:])))

    # Split into training and testing.
    births_train, births_test = births_hashed.randomSplit([0.6, 0.4])

    # Estimate a logistic regression model (L-BFGS optimizer).
    LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

    # Predict the classes for our testing set.
    LR_results = (
        births_test.map(lambda row: row.label).zip(LR_Model.predict(births_test.map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

    # Check how well or how bad our model performed.
    print('********************************************000')
    LR_evaluation = mllib_eval.BinaryClassificationMetrics(LR_results)
    print('********************************************001')
    print('Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR))
    print('********************************************002')
    print('Area under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC))
    print('********************************************003')
    LR_evaluation.unpersist()

    # Select the most predictable features using a Chi-Square selector.
    selector = mllib_feature.ChiSqSelector(4).fit(births_train)

    topFeatures_train = (
        births_train.map(lambda row: row.label).zip(selector.transform(births_train.map(lambda row: row.features)))
    ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))

    topFeatures_test = (
        births_test.map(lambda row: row.label).zip(selector.transform(births_test.map(lambda row: row.features)))
    ).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))

    # Build a random forest model on the reduced feature set.
    RF_model = RandomForest.trainClassifier(data=topFeatures_train, numClasses=2, categoricalFeaturesInfo={}, numTrees=6, featureSubsetStrategy='all', seed=666)

    RF_results = (topFeatures_test.map(lambda row: row.label).zip(RF_model.predict(topFeatures_test.map(lambda row: row.features))))

    RF_evaluation = mllib_eval.BinaryClassificationMetrics(RF_results)

    print('Area under PR: {0:.2f}'.format(RF_evaluation.areaUnderPR))
    print('Area under ROC: {0:.2f}'.format(RF_evaluation.areaUnderROC))
    RF_evaluation.unpersist()

    # See how the logistic regression would perform with reduced number of features.
    LR_Model_2 = LogisticRegressionWithLBFGS.train(topFeatures_train, iterations=10)

    LR_results_2 = (
        topFeatures_test.map(lambda row: row.label).zip(LR_Model_2.predict(topFeatures_test.map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

    LR_evaluation_2 = mllib_eval.BinaryClassificationMetrics(LR_results_2)

    print('Area under PR: {0:.2f}'.format(LR_evaluation_2.areaUnderPR))
    print('Area under ROC: {0:.2f}'.format(LR_evaluation_2.areaUnderROC))
    LR_evaluation_2.unpersist()