def get_telemetry_crashes(sc, versions, days, product='Firefox'):
    """Load Socorro crash reports from the telemetry parquet bucket.

    Reads one parquet partition per day returned by ``utils.get_days(days)``,
    then restricts the result to the given ``product`` and ``versions``.
    For every product except 'FennecAndroid', the Android-specific columns
    are dropped since they carry no data there.

    :param sc: SparkContext used to build the SQLContext.
    :param versions: iterable of version strings to keep.
    :param days: value understood by ``utils.get_days`` describing the date range.
    :param product: product name to filter on (default 'Firefox').
    :return: a Spark DataFrame of matching crash reports.
    """
    day_list = utils.get_days(days)
    paths = [
        's3://telemetry-parquet/socorro_crash/v2/crash_date=' + d.strftime('%Y%m%d')
        for d in day_list
    ]
    crashes = SQLContext(sc).read.load(paths, 'parquet')

    if product != 'FennecAndroid':
        # These columns are only populated for Android crashes; drop them
        # for every other product.
        android_only = {
            'android_board',
            'android_brand',
            'android_cpu_abi',
            'android_cpu_abi2',
            'android_device',
            'android_hardware',
            'android_manufacturer',
            'android_model',
            'android_version',
        }
        kept = [c for c in crashes.columns if c not in android_only]
        crashes = crashes.select(kept)

    product_match = crashes['product'] == product
    version_match = crashes['version'].isin(versions)
    return crashes.filter(product_match & version_match)
def load_dataFrame_from_csv(self, csvFilePath):
    """Read the matches CSV into a DataFrame with an explicit schema.

    The file is parsed with a header row and the fixed schema below
    (all fields nullable). Rows whose ``no`` column holds the literal
    string "None" are filtered out before returning.

    :param csvFilePath: path to the CSV file to load.
    :return: a Spark DataFrame of the rows whose ``no`` column is not "None".
    """
    # (column name, Spark type) pairs, in the exact CSV column order.
    column_types = [
        ("X2", StringType()),
        ("X4", StringType()),
        ("X5", StringType()),
        ("X6", StringType()),
        ("adversaire", StringType()),
        ("score_france", IntegerType()),
        ("score_adversaire", IntegerType()),
        ("penalty_france", StringType()),
        ("penalty_adversaire", StringType()),
        ("date", DateType()),
        ("year", IntegerType()),
        ("outcome", StringType()),
        ("no", StringType()),
    ]
    schema = StructType(
        [StructField(name, dtype, True) for name, dtype in column_types]
    )

    raw_df = SQLContext(self.spark).read.csv(
        csvFilePath, header=True, schema=schema
    )
    # "None" here is a string sentinel present in the CSV itself,
    # not a SQL NULL.
    return raw_df.filter(raw_df.no != "None")