def read_csv(spark, file_name):
    sql_context = SQLContext(spark)
    df = sql_context.read.format('com.databricks.spark.csv').options(
        header='true', format="string").load(file_name)
    dateIndexer = StringIndexer(inputCol="date", outputCol="date_index").fit(df)
    serialIndexer = StringIndexer(inputCol="serial_number", outputCol="serial_number_index").fit(df)
    modelIndexer = StringIndexer(inputCol="model", outputCol="model_index").fit(df)
    df1 = dateIndexer.transform(df)
    df2 = serialIndexer.transform(df1)
    df3 = modelIndexer.transform(df2)
    df3 = df3.na.fill("0")
    f_cols = df3.columns[5:]
    for name in f_cols:
        # df3 = df3.withColumn(name, "0" if df3[name] == "null" else df3[name])
        df3 = df3.withColumn(name, df3[name].cast("double"))
    # df3.show()
    assembler = VectorAssembler(inputCols=f_cols, outputCol="indexedFeatures")
    df4 = assembler.transform(df3)
    return df4
def __clean_data(self, df, is_fraud="isfraud"):
    ignore = [is_fraud, 'label']

    # Remove unused columns
    df = df.drop(*['paysim_id', 'nameorig', 'namedest'])

    # String indexing
    string_indexer = StringIndexer(inputCol="type", outputCol="type_numeric").fit(df)
    df = string_indexer.transform(df)
    df = df.drop(df.type)

    # One-hot encoding
    # (OneHotEncoder is used here as a pure transformer, i.e. the Spark 2.x API; in Spark 3.x it must be fit first)
    encoder = OneHotEncoder(inputCol="type_numeric", outputCol="type_vector")
    df = encoder.transform(df)
    df = df.drop("type_numeric")

    # Label encoding
    label_stringIdx = StringIndexer(inputCol=is_fraud, outputCol='label').fit(df)
    df = label_stringIdx.transform(df)
    df = df.drop(is_fraud)

    # Vector assembling
    assembler = VectorAssembler(
        inputCols=[x for x in df.columns if x not in ignore],
        outputCol='features')
    df = assembler.transform(df)

    # dataframe in the correct format
    selectedCols = ['label', 'features']
    df = df.select(selectedCols)
    return df
def load_csv():
    raw_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("mode", "DROPMALFORMED") \
        .load(csv_path)
    data_df = raw_df.select("Cardholder Last Name", "Cardholder First Initial", "Amount", "Vendor", "Year-Month") \
        .select(
            concat(col("Cardholder Last Name"), lit(" "), col("Cardholder First Initial")).alias("u"),
            concat(col("Vendor")).alias("m"),
            col("Year-Month").alias("date"),
            col("Amount")
        )
    userIndexer = StringIndexer(inputCol="u", outputCol="uid").fit(data_df)
    itemIndexer = StringIndexer(inputCol="m", outputCol="mid").fit(data_df)
    data_df = itemIndexer.transform(userIndexer.transform(data_df)) \
        .withColumn("uid", (col("uid") + 1).cast(FloatType())) \
        .withColumn("mid", (col("mid") + 1).cast(FloatType())) \
        .cache()
    month_seq_udf = udf(lambda s: _date_to_month(s))
    uDF = data_df.select("uid", "u").distinct().orderBy("uid")
    mDF = data_df.select("mid", "m").distinct().orderBy("mid")
    tDF = data_df.filter(data_df["uid"] <= u_limit).filter(data_df["mid"] <= m_limit) \
        .withColumn("month", month_seq_udf(col("date"))) \
        .drop("u", "m")
    return uDF, mDF, tDF
def main(train_x, train_y, test_x, test_y=None, idf=False, ngram=1, base='gs', asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a label column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens.
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # Convert the string labels to numeric indices.
    # The handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting.
    label_indexer = StringIndexer(inputCol='label', outputCol='indexedLabel', handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # The test set won't always have labels.
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction', outputCol='predictedClass', labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn('correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())
    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    w = Window.partitionBy("user_id")

    def ratio_count(c, w):
        return (col(c) / count(c).over(w))

    test = test.select("user_id", "track_id", ratio_count("count", w).alias("count"))
    test.createOrReplaceTempView('test')
    print("Ratio scores done")

    train_sample = spark.read.parquet('hdfs:/user/dev241/extension4_ratio.parquet')
    train_sample.createOrReplaceTempView('train_sample')
    print("Training sample ext4 loaded")

    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)

    # change to best
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    model = ALSModel.load("Extension4_ratio")
    print('Model loaded')

    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = spark.read.parquet('hdfs:/user/dev241/rec_test4.parquet')
    print('Rec test loaded.')
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    print('Join done.')
    j4 = join.toDF('user_idx', 'test_labels', 'user_idx2', 'recommendations')
    j4.write.parquet("ext4join")
    print('j4 parquet written')
    predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    print('Map done.')
    metrics = RankingMetrics(predictionAndLabels)
    print('RM done.')
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)
    pass
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train_sample_file = 'hdfs:/user/ah3243/extension1_count_greater_1.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    train_sample = spark.read.parquet(train_sample_file)

    # StringIndexer
    print("String Indexer entered")
    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)
    print("String Indexer done")

    # change to best
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    # model
    als = ALS(rank=rank, alpha=alpha, regParam=reg, userCol="user_idx", itemCol="track_idx",
              ratingCol="count", coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for test done")
    model.save("Test_Model")
    print("Model save for test done")

    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)
    pass
def indexData(df_sample):
    df_sche = df_sample.schema.fields
    for s in df_sche:
        n = s.name
        if (n != "target") and (n != "id"):
            print(n)
            indexer = StringIndexer(inputCol=n, outputCol=n + "_index").fit(df_sample)
            df_sample = indexer.transform(df_sample).drop(n)
        elif n == "id":
            indexer = StringIndexer(inputCol=n, outputCol=n + "_index").fit(df_sample)
            df_sample = indexer.transform(df_sample)
    return df_sample
def train_test(self, df):
    df = self.dropNonTCPUDP(df)
    catCols = []
    numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy',
               'num_pkts_out', 'num_pkts_in', 'duration']
    labelCol = 'label'
    data = self.get_dummy(df, catCols, numCols, labelCol)
    data.show()

    labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
    labelIndexer.transform(data)  # result not captured; the Pipeline below re-applies this stage
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures").fit(data)
    featureIndexer.transform(data)  # result not captured; the Pipeline below re-applies this stage

    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    trainingData.cache()
    # trainingData.repartition(200)
    testData.cache()
    # testData.repartition(200)
    trainingData.show(5, False)
    testData.show(5, False)

    rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("features", "label", "predictedLabel", "prediction")

    # Select (prediction, true label) and compute test error
    print(self.getTestError(predictions))
    self.printMetrics(predictions)
    # print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))
    return model
def get_sample_data():
    '''
    This function loads and returns the iris dataset for example purposes.

    Arguments:
        None

    Returns:
        data {PySpark Dataframe} -- Returns the iris dataset
    '''
    iris = datasets.load_iris()
    data1 = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                         columns=iris['feature_names'] + ['target'])
    data = spark.createDataFrame(data1)

    # vectorize all numerical columns into a single feature column
    feature_cols = data.columns[:-1]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    data = assembler.transform(data)

    # convert text labels into indices
    data = data.select(['features', 'target'])
    label_indexer = StringIndexer(inputCol='target', outputCol='label').fit(data)
    data = label_indexer.transform(data)

    # only select the features and label column
    data = data.select(['features', 'label'])
    return data
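# A minimal usage sketch (not part of the original snippet): assuming an active
# `spark` session and the get_sample_data() helper above, the returned
# [features, label] DataFrame can be fed directly into any pyspark.ml classifier.
from pyspark.ml.classification import LogisticRegression

data = get_sample_data()
train, test = data.randomSplit([0.8, 0.2], seed=42)
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train)
lr_model.transform(test).select('label', 'prediction').show(5)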
def isLabelIndexed(self, schemaData, label, dataset):
    isLabelIndexed = "no"
    labelIndexedInfo = {}
    labelIndexer = None
    for schemaVal in schemaData:
        if (str(schemaVal.dataType) == "StringType" and schemaVal.name == label):
            labelIndexer = StringIndexer(
                inputCol=label,
                outputCol=PredictiveConstants.INDEXED_ + label,
                handleInvalid="keep").fit(dataset)
            dataset = labelIndexer.transform(dataset)
            label = PredictiveConstants.INDEXED_ + label
            isLabelIndexed = "yes"
        if (str(schemaVal.dataType) != "StringType" and schemaVal.name == label):
            label = label
            isLabelIndexed = "no"

    labelIndexedInfo.update({
        PredictiveConstants.DATASET: dataset,
        PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
        PredictiveConstants.LABELINDEXER: labelIndexer
    })
    return labelIndexedInfo
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1])
                            for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
            .filter(predictions.date2 == get_cur()) \
            .sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
def predict(doc):
    tweet = Row(source=doc['source'],
                retweet_count=doc['retweet_count'],
                favorite_count=doc['favorite_count'],
                is_retweet=doc['is_retweet'],
                sentiment_compound=sentiment_analyze(doc['text'], "compound"),
                sentiment_neg=sentiment_analyze(doc['text'], "neg"),
                sentiment_neu=sentiment_analyze(doc['text'], "neu"),
                sentiment_pos=sentiment_analyze(doc['text'], "pos"),
                hour=util.convertUTCtoHourOfDay(doc['created_at']),
                day=util.convertUTCtoDay(doc['created_at']),
                week=util.convertUTCtoWeekNumber(doc['created_at']),
                month=util.convertUTCtoMonth(doc['created_at']),
                year=util.convertUTCtoYear(doc['created_at']))

    tweet_df = spark.createDataFrame([tweet])
    str_indexer = StringIndexer().setInputCol("source").setOutputCol("source_index").fit(tweet_df)
    tweet_df2 = str_indexer.transform(tweet_df)
    tweet_df3 = tweet_df2.select([col(c).cast("double").alias(c) for c in tweet_df2.columns])

    predictions = model.transform(tweet_df3)
    result = predictions.select("prediction").collect()
    if len(result) > 0:
        return result[0]['prediction']
    return None
def parse_data(self, path_ratings, nrows):
    df_ratings = self.sqlContext.read.csv(path_ratings, header=True, quote='"').limit(nrows)
    # self.data.count()
    raw_to_uid = StringIndexer(inputCol="user_id", outputCol="UID").fit(df_ratings)
    self.data = raw_to_uid.transform(df_ratings)
    raw_to_iid = StringIndexer(inputCol="business_id", outputCol="IID").fit(df_ratings)
    self.data = raw_to_iid.transform(self.data)
    # uid and iid must be integers for spark ALS
    self.data = self.data.rdd.map(
        lambda r: (int(r['UID']),
                   int(r['IID']),
                   float(r['stars']))) \
        .toDF(("UID", "IID", "stars"))
def review_ids_to_number(dataframe):
    # build indexer model for user_id
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_num").fit(dataframe)
    indexer_user_save = os.path.join('model', 'user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)

    # build indexer model for business_id
    indexer_business = StringIndexer(inputCol="business_id", outputCol="business_id_num",
                                     handleInvalid="skip").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)

    # transform the id columns to numeric indices
    indexed = indexer_user.transform(dataframe)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)

    # save the indexed dataframe used for ALS training
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('Indexed dataframe for ALS training saved to review_vegas_als.parquet')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
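# Hedged sketch (not from the original code): the two fitted indexer models saved
# above under model/user_ind_model and model/bus_ind_model can later be reloaded
# with StringIndexerModel.load() and, if needed, inverted with IndexToString to map
# ALS ids back to the raw user_id / business_id strings. Paths and column names
# follow the snippet above; everything else is illustrative.
import os
from pyspark.ml.feature import StringIndexerModel, IndexToString

user_indexer_model = StringIndexerModel.load(os.path.join('model', 'user_ind_model'))
bus_indexer_model = StringIndexerModel.load(os.path.join('model', 'bus_ind_model'))

# Invert the numeric user index produced by the saved model back to the original string id.
user_id_decoder = IndexToString(inputCol="user_id_num", outputCol="user_id_orig",
                                labels=user_indexer_model.labels)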
def ProcessData(df):
    df = df.withColumn("label", df["Cancelled"].cast(IntegerType()))

    # categoricalColumns = ['Origin','Dest']
    # Categorical to Continuous/Ordinal/assigning the index
    categoricalColumns = ['Origin', 'Dest']
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index').fit(df)
        df = stringIndexer.transform(df)

    # One Hot Encoder
    # encoder = OneHotEncoderEstimator(inputCols=["OriginIndex", "DestIndex"],
    #                                  outputCols=["categoryVec1", "categoryVec2"])
    # model = encoder.fit(df)
    # encoded = model.transform(df)
    # for categoricalCol in categoricalColumns:
    #     stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index').fit(df)
    #     df = stringIndexer.transform(df)

    # YearInt is required by the assembler below
    df = df.withColumn("YearInt", df["Year"].cast(IntegerType()))
    df = df.withColumn("MonthInt", df["Month"].cast(IntegerType()))
    df = df.withColumn("DayofMonthInt", df["DayofMonth"].cast(IntegerType()))
    df = df.withColumn("DayofWeekInt", df["DayOfWeek"].cast(IntegerType()))
    df = df.withColumn("DepTimeInt", df["DepTime"].cast(IntegerType()))
    df = df.withColumn("CRSDepTimeInt", df["CRSDepTime"].cast(IntegerType()))
    df = df.withColumn("ArrTimeInt", df["ArrTime"].cast(IntegerType()))
    df = df.withColumn("CRSArrTimeInt", df["CRSArrTime"].cast(IntegerType()))
    df = df.withColumn("ActualElapsedTimeInt", df["ActualElapsedTime"].cast(IntegerType()))
    df = df.withColumn("CRSElapsedTimeInt", df["CRSElapsedTime"].cast(IntegerType()))
    df = df.withColumn("ArrDelayInt", df["ArrDelay"].cast(IntegerType()))
    df = df.withColumn("DepDelayInt", df["DepDelay"].cast(IntegerType()))
    df = df.withColumn("DistanceInt", df["Distance"].cast(IntegerType()))
    # df = df.withColumn("label", df["Cancelled"].cast(IntegerType()))

    # encoder = OneHotEncoderEstimator(inputCols=["OriginIndex", "DestIndex"],
    #                                  outputCols=["categoryVec1", "categoryVec2"])
    # model = encoder.fit(df)
    # encoded = model.transform(df)

    assembler = VectorAssembler(inputCols=["YearInt", "MonthInt", "DayofMonthInt", "DayofWeekInt",
                                           "DepTimeInt", "CRSDepTimeInt", "ArrTimeInt", "CRSArrTimeInt",
                                           "ActualElapsedTimeInt", "CRSElapsedTimeInt", "ArrDelayInt",
                                           "DepDelayInt", "OriginIndex", "DestIndex", "DistanceInt"],
                                outputCol="features")
    # assembler = VectorAssembler(inputCols=["YearInt","MonthInt","DayofMonthInt","DayofWeekInt","DepTimeInt","CRSDepTimeInt","ActualElapsedTimeInt","CRSElapsedTimeInt","ArrDelayInt","DepDelayInt","OriginIndex","DestIndex","DistanceInt"], outputCol="features")
    df = assembler.transform(df)
    return df


"""============================================================================================================="""
def vectorizeData(df, labelsCol, weighClass=False, featsCol=None):
    """Creates a dataset from a Spark DataFrame of mixed categorical and numerical
    features. The function returns only two columns, 'label' and 'features'. The
    input Spark dataframe is 'df'. The column name corresponding to the training
    labels must be provided in 'labelsCol'."""
    assert labelsCol in df.columns  # 'labelsCol' is not in df.columns

    # Importantly: replace missing numerical values by zero and missing categorical values by "NONE" (string)
    df = df.fillna(0).fillna("NONE")
    stringColList = [i[0] for i in df.dtypes if (i[1] == 'string' and i[0] != labelsCol)]

    # Indexing categorical features (string types)
    indexedCategoricalCols = [categoricalCol + "Index" for categoricalCol in stringColList]
    stages = [
        StringIndexer(inputCol=categoricalCol, outputCol=idx_categoricalCol)
        for categoricalCol, idx_categoricalCol in zip(stringColList, indexedCategoricalCols)
    ]
    indexer = Pipeline(stages=stages)
    df = indexer.fit(df).transform(df)

    # Assembling indexed and numeric features
    numericColList = [i[0] for i in df.dtypes if (i[1] != 'string' and i[0] != labelsCol)]
    assemblerInputs = indexedCategoricalCols + numericColList
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features" if featsCol is None else featsCol)
    df = assembler.transform(df)

    # Indexing binary labels
    labeller = StringIndexer(inputCol=labelsCol, outputCol="label").fit(df)
    df = labeller.transform(df).select(["features" if featsCol is None else featsCol, "label"])

    if weighClass:
        from sklearn.utils.class_weight import compute_class_weight as weigh
        labels = [int(i.label) for i in df.select('label').collect()]
        wC0, wC1 = list(weigh(class_weight='balanced', classes=[0.0, 1.0], y=labels))
        return assemblerInputs, df.withColumn(
            'weight', F.when(df.label == 0.0, wC0).otherwise(wC1))
    else:
        return assemblerInputs, df
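# A minimal usage sketch, assuming a Spark DataFrame `raw_df` with a string label
# column named "churned" (both hypothetical); it shows how the weighted output of
# vectorizeData() plugs into a classifier via weightCol.
from pyspark.ml.classification import LogisticRegression

input_cols, train_df = vectorizeData(raw_df, labelsCol="churned", weighClass=True)
clf = LogisticRegression(featuresCol="features", labelCol="label", weightCol="weight")
model = clf.fit(train_df)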
def main(spark, train_data_file, rank_val, reg, alpha_val, user_indexer_model, item_indexer_model, model_file):
    '''
    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''
    # Load the parquet file
    train = spark.read.parquet(train_data_file)
    # val = spark.read.parquet(val_data_file)

    # transform data
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user", handleInvalid="skip").fit(train)
    indexer_item = StringIndexer(inputCol="track_id", outputCol="item", handleInvalid="skip").fit(train)

    als = ALS(userCol='user', itemCol='item', implicitPrefs=True, ratingCol='count',
              rank=rank_val, regParam=reg, alpha=alpha_val)
    pipeline = Pipeline(stages=[indexer_user, indexer_item, als])  # defined but unused; the stages are applied manually below

    train = indexer_user.transform(train)
    train = indexer_item.transform(train)
    model = als.fit(train)

    indexer_user.save(user_indexer_model)
    indexer_item.save(item_indexer_model)
    model.save(model_file)
def labelIndexing(self, sentimentInfoData):
    labelColm = sentimentInfoData.get(pc.LABELCOLM)
    dataset = sentimentInfoData.get(pc.DATASET)
    indexedLabel = pc.INDEXED_ + labelColm

    # check if the datatype of the col is integer or float or double. if yes then no need to do the indexing.
    '''for now converting each datatype to string and then indexing it.'''
    dataset = dataset.withColumn(labelColm, dataset[labelColm].cast(StringType()))
    labelIndexer = StringIndexer(inputCol=labelColm, outputCol=indexedLabel,
                                 handleInvalid="keep").fit(dataset)
    dataset = labelIndexer.transform(dataset)
    # storeLabelIndexer = labelIndexer.write().overwrite().save("")  # will update this later
    sentimentInfoData.update({
        pc.INDEXEDCOLM: indexedLabel,
        pc.DATASET: dataset
    })
    return sentimentInfoData
def random_forest(df, seed, num_of_trees_list):
    # Drop preferred_foot because it's the only categorical column; the others are all numerical.
    # Use preferred_foot if we have time to implement it.
    df = df.drop("preferred_foot")

    # Create a new column for the team_position label that is numerical instead of categorical
    labelIndexer = StringIndexer(inputCol="team_position", outputCol="indexed_label").fit(df)
    df = labelIndexer.transform(df)

    list_of_features = df.drop("team_position").drop("indexed_label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="indexed_features")
    df = assembler.transform(df)

    (training_data, testing_data) = df.randomSplit([0.8, 0.2], seed)  # Split the training and testing data

    accuracy_list = []
    cm_list = []  # List of confusion matrices
    for num_of_trees in num_of_trees_list:
        random_forest = RandomForestClassifier(labelCol="indexed_label", featuresCol="indexed_features",
                                               impurity="entropy", numTrees=num_of_trees, maxDepth=10)
        model = random_forest.fit(training_data)
        predictions = model.transform(testing_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label", predictionCol="prediction",
                                                      metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        accuracy_list.append(accuracy)

        y_true = predictions.select(['indexed_label']).collect()
        y_pred = predictions.select(['prediction']).collect()

        print("Classification report and confusion matrix for Random Forest with " + str(num_of_trees) + " trees:")
        print(classification_report(y_true, y_pred))
        cm = confusion_matrix(y_true, y_pred)
        confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]],
                                      [cm[2][1], cm[2][2], cm[2][0]],
                                      [cm[0][1], cm[0][2], cm[0][0]]]
        print("")
        print(confusion_matrix_corrected[0])
        print(confusion_matrix_corrected[1])
        print(confusion_matrix_corrected[2])
        cm_list.append(np.array([confusion_matrix_corrected[0],
                                 confusion_matrix_corrected[1],
                                 confusion_matrix_corrected[2]]))
    return accuracy_list, cm_list
def add_demo(self):
    import pyspark
    try:
        return self.spark.read.parquet(self.cur_demo_file_name).withColumnRenamed("HADM_ID", "ID")
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.info("PROCESS")

        from pyspark.sql.functions import datediff, col, when  # `when` is needed for the AGE capping below
        from pyspark.ml.feature import OneHotEncoder, StringIndexer
        from pyspark.ml.feature import VectorAssembler

        cur_demo = self.spark.read.parquet(self.data_dir + "/ADMISSIONS").select(
            "SUBJECT_ID", "HADM_ID", "ADMITTIME", "ADMISSION_TYPE", "ADMISSION_LOCATION",
            "INSURANCE", "LANGUAGE", "RELIGION", "MARITAL_STATUS", "ETHNICITY")
        cur_pts = self.spark.read.parquet(self.data_dir + "/PATIENTS").select("SUBJECT_ID", "DOB", "GENDER")
        merged_demo = cur_demo.join(cur_pts, "SUBJECT_ID").drop("SUBJECT_ID")
        merged_demo = merged_demo.withColumn("AGE", datediff("ADMITTIME", "DOB") / 365.0) \
            .withColumn("AGE", when(col("AGE") > 90, 90).otherwise(col("AGE"))) \
            .drop("ADMITTIME", "DOB").where("AGE > 18").fillna("N/A")

        target_col = merged_demo.columns
        target_col.remove("AGE")
        target_col.remove("HADM_ID")
        target_col.sort()
        self.logger.debug(target_col)

        vector_target = ["AGE"]
        demo_col_list = ["AGE"]
        for cat_col in target_col:
            SI_model = StringIndexer(inputCol=cat_col, outputCol="SI_{0}".format(cat_col)).fit(merged_demo)
            demo_col_list = demo_col_list + [demo_var + "||" + demo_info for demo_var, demo_info in
                                             zip([cat_col] * len(SI_model.labels), SI_model.labels)]
            merged_demo = SI_model.transform(merged_demo)
            merged_demo = OneHotEncoder(inputCol="SI_{0}".format(cat_col),
                                        outputCol="OH_{0}".format(cat_col),
                                        dropLast=False).transform(merged_demo)
            vector_target.append("OH_{0}".format(cat_col))

        import json
        json.dump({"demo_feature": demo_col_list}, open(self.json_demo_feature_dump_loc, "w"))
        sorted(vector_target)
        self.logger.debug(vector_target)
        return_df = VectorAssembler(inputCols=vector_target, outputCol="demo_feature").transform(merged_demo)
        return_df.write.save(self.cur_demo_file_name)
        return_df = self.spark.read.parquet(self.cur_demo_file_name) \
            .withColumnRenamed("HADM_ID", "ID").select("ID", "demo_feature")
        return return_df
def naive_bayes(df, seed):
    # Drop preferred_foot because it's the only categorical column; the others are all numerical.
    # Use preferred_foot if we have time to implement it.
    df = df.drop("preferred_foot")

    labelIndexer = StringIndexer(inputCol="team_position", outputCol="label").fit(df)
    df = labelIndexer.transform(df)
    df = df.drop("team_position")

    list_of_features = df.drop("label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="features")
    df = assembler.transform(df)

    (train_data, test_data) = df.randomSplit([0.8, 0.2], seed)

    n_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = n_bayes.fit(train_data)  # Training happens here
    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    y_true = predictions.select(['label']).collect()
    y_pred = predictions.select(['prediction']).collect()

    print("Classification report and confusion matrix for Naive Bayes:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]],
                                  [cm[2][1], cm[2][2], cm[2][0]],
                                  [cm[0][1], cm[0][2], cm[0][0]]]
    print("")
    print(confusion_matrix_corrected[0])
    print(confusion_matrix_corrected[1])
    print(confusion_matrix_corrected[2])
    cm = np.array([confusion_matrix_corrected[0],
                   confusion_matrix_corrected[1],
                   confusion_matrix_corrected[2]])
    return accuracy, cm
def stringIndexer(infoData):
    colmToIndex = infoData.get(mc.COLMTOINDEX)
    dataset = infoData.get(mc.DATASET)
    indexedColm = infoData.get(mc.INDEXEDCOLM)
    storageLocation = infoData.get(mc.STORAGELOCATION)
    indexerName = colmToIndex + mc.INDEXER
    file = storageLocation + indexerName

    # check if the datatype of the col is integer or float or double. if yes then no need to do the indexing -- sahil.
    '''for now converting each datatype to string and then indexing it.'''
    dataset = dataset.withColumn(colmToIndex, dataset[colmToIndex].cast(StringType()))
    stringIndexer = StringIndexer(inputCol=colmToIndex, outputCol=indexedColm,
                                  handleInvalid="keep").fit(dataset)
    dataset = stringIndexer.transform(dataset)
    stringIndexer.write().overwrite().save(file)  # will update this later

    indexerPathMapping = infoData.get(mc.INDEXERPATHMAPPING)
    indexerPathMapping.update({colmToIndex: file})
    infoData.update({
        mc.INDEXERPATHMAPPING: indexerPathMapping,
        mc.DATASET: dataset
    })
    return infoData
def index_nominals(dataframe, renamer=lambda string: u"indexed_%s" % string):
    """Create indexed versions of nominal features in the given dataframe."""
    all_cols = dataframe.columns
    schema = dataframe.schema
    names_by_idx = [str(name) for name in schema.names]
    types_by_idx = [field.dataType for field in schema.fields]
    labels_by_idx = [[] for idx in range(len(all_cols))]
    dataframe_indexing = dataframe

    # The new (or old if not indexed) column names.
    columns = []

    # Fit and apply a sequence of nominal feature indexers.
    for idx, col in enumerate(all_cols):
        if types_by_idx[idx] is StringType():
            # Encode nominal features into doubles.
            indexer = StringIndexer(
                inputCol=col, outputCol=renamer(col)).fit(dataframe_indexing)
            labels_by_idx[idx] = indexer.labels
            dataframe_indexing = indexer.transform(dataframe_indexing)
            columns.append(renamer(col))
        else:
            labels_by_idx[idx] = []
            columns.append(col)

    # Create the object that holds the information necessary to get
    # column and value names for the various converted features and
    # values.
    namer = ValueMapper(columns_by_idx=names_by_idx,
                        types_by_idx=types_by_idx,
                        values_by_idx=labels_by_idx)
    return dataframe_indexing, columns, namer
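# Usage sketch (assumption: a DataFrame `df` with a mix of string and numeric columns).
# index_nominals() returns the indexed DataFrame, the post-indexing column names,
# and the ValueMapper needed to translate indices back to the original labels.
indexed_df, feature_cols, value_mapper = index_nominals(df)
indexed_df.select(feature_cols).show(5)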
def get_rs(args):
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.ml.feature import StringIndexer
    from pyspark.sql import SparkSession
    import random
    import string

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    spark = SparkSession.builder.appName('Session_%s' % id_generator()).getOrCreate()

    df_train = spark.read.parquet("./cf_train_subsampled.parquet")
    df_val = spark.read.parquet("./cf_validation_subsampled.parquet")
    df_test = spark.read.parquet("./cf_test_subsampled.parquet")

    # train contains all users, but not all tracks
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train)
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(df_train.union(df_val))

    df_train = user_indexer.transform(df_train)
    df_train = track_indexer.transform(df_train)
    df_val = user_indexer.transform(df_val)
    df_val = track_indexer.transform(df_val)
    df_test = user_indexer.transform(df_test)
    df_test = track_indexer.transform(df_test)

    rank, regParam, alpha = args
    als = ALS(rank=rank, maxIter=10, regParam=regParam, alpha=alpha, implicitPrefs=True,
              userCol="user_id_numeric", itemCol="track_id_numeric", ratingCol="count",
              coldStartStrategy="drop")
    # model = als.trainImplicit(df_train)
    model = als.fit(df_train)

    # Evaluate the model by computing the RMSE on the validation data
    predictions = model.transform(df_val)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    return [rank, regParam, alpha, rmse]
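# Hedged sketch of how get_rs() might be driven from a small manual grid search;
# the hyperparameter values below are illustrative only.
results = []
for rank in [10, 50, 100]:
    for regParam in [0.01, 0.1]:
        for alpha in [1.0, 10.0]:
            results.append(get_rs((rank, regParam, alpha)))

# Sort by the RMSE returned in the last position of each result row.
best = sorted(results, key=lambda r: r[3])[0]
print("Best (rank, regParam, alpha, rmse):", best)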
sep="\t", inferSchema="true", header="false") test = spark.read.load("hdfs://10.190.2.112/data/test_set.txt", format="csv", sep="\t", inferSchema="true", header="false") # only for feature transform total = train.union(val).union(test) # create features indexer = StringIndexer(inputCol="_c2", outputCol="c22") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) test = indexer.transform(test) # create label indexer = StringIndexer(inputCol="_c6", outputCol="label") indexer = indexer.fit(total) train = indexer.transform(train) val = indexer.transform(val) test = indexer.transform(test) # One-hot encoder encoder = OneHotEncoder(inputCol="c22", outputCol="c2") train = encoder.transform(train) val = encoder.transform(val) test = encoder.transform(test) # create the trainer and set its parameters
df = df.select('asin', 'reviewerID', 'overall')
df.printSchema()
df.show()

# encoding IDs to fit in the model
from pyspark.ml.feature import StringIndexer

a = StringIndexer(inputCol="reviewerID", outputCol="reviewerIDIndex", handleInvalid='skip')
r = a.fit(df)
indexedDf = r.transform(df)
indexedDf.show()

asinIndexer = StringIndexer(inputCol="asin", outputCol="asinIndex", handleInvalid='skip')
a = asinIndexer.fit(df)
indexedDf = a.transform(indexedDf)
indexedDf.show()

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import regexp_replace

indexedDf = indexedDf.withColumn("reviewerID", indexedDf["reviewerIDIndex"].cast(IntegerType()))
indexedDf = indexedDf.withColumn("asin", indexedDf["asinIndex"].cast(IntegerType()))
# indexedDf.show()
# indexedDf.toPandas().to_csv(indexedDf.csv, header=True, index=False)

indexedDf = indexedDf.select('asin', 'reviewerID', 'overall')
indexedDf.show()
print(indexedDf.count())
# COMMAND ----------

data.show(5)

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

# COMMAND ----------

indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_idx').fit(data)

# COMMAND ----------

data = indexer.transform(data)

# COMMAND ----------

data.show(5)

# COMMAND ----------

from pyspark.ml import linalg
from pyspark.ml.feature import VectorAssembler

# COMMAND ----------

vector = VectorAssembler(inputCols=[
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'cruise_idx'
testing = testing.withColumn("Fare2", testing["Fare"].cast(DoubleType())) testing = testing.fillna(-1, subset=["Pclass2", "SibSp2", "Parch2", "Fare2"]) testing = testing.fillna(29.67, subset=["Age2"]) # COMMAND ---------- display(training) # COMMAND ---------- labelIndexer = StringIndexer(inputCol="Survived", outputCol="indexedLabel").fit(training) # COMMAND ---------- trainingFeatureTest = labelIndexer.transform(training) display(trainingFeatureTest.select("Survived", "indexedLabel")) # COMMAND ---------- featureIndexer1 = StringIndexer(inputCol="Sex", outputCol="feature1").fit(training) # COMMAND ---------- trainingFeatureTest = featureIndexer1.transform(trainingFeatureTest) display( trainingFeatureTest.select("Survived", "indexedLabel", "Sex", "feature1")) # COMMAND ----------
features = ['Price', 'Date of Transfer', 'Property Type', 'Old/New', 'Town/City', 'District', 'County']
data = data.select(features)

# convert all selected string columns into integer indices
date_indexer = StringIndexer(inputCol='Date of Transfer', outputCol='Date_of_TransferIndexed')
date_indexer = date_indexer.fit(data)
property_type_indexer = StringIndexer(inputCol='Property Type', outputCol='Property_typeIndexed')
property_type_indexer = property_type_indexer.fit(data)
olde_new_indexer = StringIndexer(inputCol='Old/New', outputCol='Old_NewIndexed')
olde_new_indexer = olde_new_indexer.fit(data)
town_indexer = StringIndexer(inputCol='Town/City', outputCol='TownIndexed')
town_indexer = town_indexer.fit(data)
district_indexer = StringIndexer(inputCol='District', outputCol='DistrictIndexed')
district_indexer = district_indexer.fit(data)
county_indexer = StringIndexer(inputCol='County', outputCol='CountyIndexed')
county_indexer = county_indexer.fit(data)

data = date_indexer.transform(data)
data = property_type_indexer.transform(data)
data = olde_new_indexer.transform(data)
data = town_indexer.transform(data)
data = district_indexer.transform(data)
data = county_indexer.transform(data)
data.show()

assembler = VectorAssembler(inputCols=['Date_of_TransferIndexed', 'CountyIndexed'], outputCol='features')
output = assembler.transform(data)
final_data = output.select('features', 'Price')
train_data, test_data = final_data.randomSplit([0.7, 0.3])

lr = LinearRegression(labelCol='Price')
lr_model = lr.fit(train_data)

# save results
# cache the joined data to reduce processing
cached_join = weather_airline_joined.cache()

# perform train/test split based on year
train_set = filter_to_train(cached_join).cache()
test_set = filter_to_test(cached_join).cache()

# COMMAND ----------

# Index label
labelIndexer = StringIndexer(
    inputCol="dep_del15", outputCol="label").setHandleInvalid("keep").fit(train_set)
train_set = labelIndexer.transform(train_set)
test_set = labelIndexer.transform(test_set)

# Index features
categorical = [
    "month", "day_of_week", "op_unique_carrier", "Holiday",
    "PREVIOUS_FLIGHT_DELAYED_FOR_MODELS", "origin_WND_direction_angle",
    "origin_WND_type_code", "origin_CIG_ceiling_visibility_okay",
    "origin_VIS_variability", "dest_WND_direction_angle", "dest_WND_type_code",
    "dest_CIG_ceiling_visibility_okay", "dest_VIS_variability", "crs_dep_hour",
    'distance_group', 'origin_airport_id'
]
categorical_index = [i + "_Index" for i in categorical]
stringIndexer = StringIndexer(
train_feature_df = feature_df.filter(feature_df['time'] <= split_time)
test_feature_df = feature_df.filter(feature_df['time'] > split_time)
train_feature_df = train_feature_df.drop('time')
test_feature_df = test_feature_df.drop('time')

assembler = VectorAssembler(
    inputCols=list(set(train_feature_df.columns) - set(['result', 'home_name', 'away_name'])),
    outputCol="features")
train_df = assembler.transform(train_feature_df)
test_df = assembler.transform(test_feature_df)

labelIndexer = StringIndexer(inputCol="result", outputCol="indexedResult").fit(feature_df)
train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)

# labels is a property (list of label strings), not a method
label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
#
# Feature selection is not really supported yet in mllib, therefore we just applied dimensionality reduction using PCA

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)
def applyModel(fileName, loadModelName, outlierPercentile=100):
    sc = SparkContext('local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########
    data = sc.textFile(fileName)
    # extract header and remove it
    header = data.first()
    data = data.filter(lambda x: x != header).cache()
    header = header.split('\t')
    # parse data
    data = data.map(lambda x: x.split('\t'))

    #########
    # prepare features
    #########
    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME", func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
          .withColumn("TIMESTAMP", func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
          .withColumn("GEOIP_LAT", func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG", func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT", func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH", func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT", func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH", func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
          )
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)
    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA",
                       func.col("TOPMOSTREACHABLEWINDOWHEIGHT") * func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE",
                       (func.col("ACTUALDEVICETYPE") == func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", func.concat(
        func.col('ACCOUNTID'), func.col('CAMPAIGNID'), func.col('CREATIVEID'), func.col('SDK')))
    # df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))
    df = df.withColumn("COMBINEDEXTERNALID", func.concat(
        func.regexp_replace('EXTERNALADSERVER', 'null', ''),
        func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''),
        func.regexp_replace('EXTERNALSITEID', 'null', ''),
        func.regexp_replace('EXTERNALSUPPLIERID', 'null', '')))
    # df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))
    df = df.withColumn("PLATFORMCOMBINED", func.concat(
        func.regexp_replace('PLATFORM', 'null', ''),
        func.regexp_replace('PLATFORMVERSION', 'null', '')))
    # df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))
    df = df.withColumn("UA_OSCOMB", func.concat(
        func.regexp_replace('UA_OS', 'null', ''),
        func.regexp_replace('UA_OSVERSION', 'null', '')))
    # df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON', '[^,\d]', ''))
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', '^,', ''))
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', ',,', ','))
    udf = func.udf(lambda x: int(np.fromstring(x, dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))
    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########
    features = ['ADLOADINGTIME', 'PLACEMENTID', 'TIMESTAMP', 'CREATIVETYPE', 'UA_HARDWARETYPE',
                'UA_VENDOR', 'UA_MODEL', 'UA_BROWSER', 'UA_BROWSERVERSION', 'FILESJSON', 'ERRORSJSON',
                'TOPMOSTREACHABLEWINDOWAREA', 'FILESJSON_SIZE', 'COMBINEDID', 'COMBINEDEXTERNALID',
                'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK', 'EXTERNALADSERVER']
    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########
    featuresCat = ['PLACEMENTID', 'CREATIVETYPE', 'UA_HARDWARETYPE', 'UA_VENDOR', 'UA_MODEL',
                   'UA_BROWSER', 'UA_BROWSERVERSION', 'FILESJSON', 'ERRORSJSON', 'COMBINEDID',
                   'COMBINEDEXTERNALID', 'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK', 'EXTERNALADSERVER']

    for i in range(len(featuresCat)):
        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_' + featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])

    featuresCat = ['_' + featuresCat[i] for i in range(len(featuresCat))]
    features = featuresCat[:]
    features.append('TIMESTAMP')
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')

    #########
    # Assemble features
    #########
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    df = assembler.transform(df)

    #########
    # Convert to labeled point (relies on the Spark 1.x DataFrame.map / spark.mllib API)
    #########
    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
          .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()

    #########
    # Load trained model
    #########
    model = RandomForestModel.load(sc, loadModelName)
    print('Model loaded!')

    predictions = model.predict(lp.map(lambda x: x.features)).collect()

    return predictions
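# Hedged note (not part of the original): indexer._call_java("write") above is an
# older workaround; on Spark 2.x+ the fitted StringIndexerModel exposes a public
# writer, and the per-column indexers saved by applyModel() can be reloaded like
# this. The "indexer_PLACEMENTID" path follows the "indexer_" + column-name pattern
# used above; everything else is illustrative.
from pyspark.ml.feature import StringIndexerModel

# Public save API equivalent to the _call_java workaround:
# indexer.write().overwrite().save("indexer_" + featuresCat[i])

loaded_indexer = StringIndexerModel.load("indexer_PLACEMENTID")
# df = loaded_indexer.transform(df)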