示例#1
0
# Command-line interface for the prep step. Both options are plain optional
# strings, so each is None when it is not supplied.
parser = argparse.ArgumentParser(prog="prep_data")
parser.add_argument("--input_file", help="input raw data file", type=str)
parser.add_argument("--output_path", help="output prepped data path", type=str)

# parse_known_args collects unrecognised arguments instead of aborting;
# they are only reported, never acted upon.
args, unknown = parser.parse_known_args()
if unknown:
    print("Unknown args:", unknown, sep="\n")

# Echo the effective arguments.
print("Argument 1 (input training data file): %s" % (args.input_file,))
print("Argument 2 (output prepped training data path) %s" % (args.output_path,))

# Read the raw CSV named on the command line.
input_file = dprep.read_csv(args.input_file)

# skin is same as thickness with another unit (inches/cm), so drop it.
without_skin = input_file.drop_columns(columns='skin')

# Rewrite the textual TRUE/FALSE values of 'diabetes' as "1"/"0" ...
diabetes_as_digits = (
    without_skin
    .replace(columns='diabetes', find="TRUE", replace_with="1")
    .replace(columns='diabetes', find="FALSE", replace_with="0")
)

# ... and then convert that column to the INTEGER field type.
prepped_data = diabetes_as_digits.set_column_types(
    type_conversions={
        'diabetes': dprep.TypeConverter(data_type=dprep.FieldType.INTEGER),
    })

# Writing is skipped entirely when no --output_path was given.
if args.output_path is not None:
    os.makedirs(args.output_path, exist_ok=True)
    print("%s created" % args.output_path)
    output_target = dprep.LocalFileOutput(args.output_path)
    write_df = prepped_data.write_to_csv(directory_path=output_target)
    write_df.run_local()
示例#2
0
                    type=str,
                    help="filter out out of city locations")

# Strict parsing: argparse exits with an error on unrecognised options.
args = parser.parse_args()

# Echo the effective arguments.
print("Argument 1(input taxi data path): %s" % (args.input_filter,))
print("Argument 2(output filtered taxi data path): %s" % (args.output_filter,))

# Read everything matching 'part-*' under the input path into one dataflow.
part_files_pattern = args.input_filter + '/part-*'
combined_df = dprep.read_csv(part_files_pattern)

# This step prepares the data for filtering out coordinates of locations
# outside the city border. For more details visit
# https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep

# Convert the four pickup/dropoff latitude/longitude columns to the DECIMAL
# field type; the same converter instance is shared by every column.
coordinate_columns = (
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
)
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(
    type_conversions={column: decimal_type for column in coordinate_columns})

# Filter out coordinates for locations that are outside the city border.
# Chain the column filter commands within the filter() function
# and define the minimum and maximum bounds for each field
latlong_filtered_df = (combined_df.drop_nulls(
    columns=[
        "pickup_longitude", "pickup_latitude", "dropoff_longitude",
        "dropoff_latitude"
示例#3
0
# Load the historical asset CSV, passing the GROUPED header-promotion mode.
asset_data_path = 'data/AssetData_Historical.csv'
asset_data_df = dprep.read_csv(
    path=asset_data_path,
    header=dprep.PromoteHeadersMode.GROUPED,
)

# Show the first five records.
asset_preview = asset_data_df.head(5)
display(asset_preview)
#%%
# Save the dataflow in a package file named 'dflows.dprep' inside the
# current working directory.
working_dir = os.getcwd()
dprep_path = os.path.join(working_dir, 'dflows.dprep')
dflow_prepared = asset_data_df
package = dprep.Package([dflow_prepared])
package.save(dprep_path)
#%%
# Reopen the saved package and continue with its first dataflow.
package_saved = dprep.Package.open(dprep_path)
saved_dataflows = package_saved.dataflows
dflow_prepared = saved_dataflows[0]
# Keep this as the last expression so an interactive cell shows the profile.
dflow_prepared.get_profile()

#%%
# Convert 'Failure_NextHour' to the INTEGER field type.
int_type = dprep.TypeConverter(dprep.FieldType.INTEGER)
dflow_prepared = dflow_prepared.set_column_types(
    type_conversions={'Failure_NextHour': int_type})

# Columns handed to to_number. The names are kept verbatim;
# 'Duct_Lenghts_in_Units' presumably matches the CSV header as spelled, so
# verify against the data before "correcting" it.
numeric_columns = [
    'Density_Overload',
    'Abnormal_Flow_Rate',
    'Heat_Flow',
    'Asset_Integrity',
    'Temperature_Differential',
    'Volumetric_Flow_Rate',
    'Tangential_Stress',
    'Duct_Lenghts_in_Units',
    'Fault_in_last_Month',
    'Avg_hours_in_Use',
    'Pressure_Alarm',
    'Inclination_Angle',
    'Operating_Pressure_above_Normal',
    'Compression_Ratio',
    'Multiple_Connects',
    'Water_Exposure_units',
    'Humidity_Factor',
    'Cathodic_Protection',
    'Pressure_Class',
]
dflow_prepared = dflow_prepared.to_number(numeric_columns)

# Show the first five records after conversion.
display(dflow_prepared.head(5))

#%%