Example #1
import os
import zipfile

import kaggle
import sklearn.metrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.storagelevel import StorageLevel

## CUSTOM IMPORT
import conf
from src import american_community_survey as amc
from src import utils
from src import download_spark

## START
# Initialize the argument parser
args = utils.get_argparser().parse_args()

utils.printNowToFile("starting:")

utils.printNowToFile("downloading spark")
download_spark.download(os.getcwd())

###############################################################
if args.host and args.port:
    spark = conf.load_conf(args.host, args.port)
else:
    spark = conf.load_conf_default()

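# ship ridge_regression.py to the executors so the import below also works on the worker nodes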
spark.sparkContext.addPyFile('ridge_regression.py')
import ridge_regression as rr

## PREPROCESSING: CLEANING
## DATA_PATH: directory the dataset is downloaded to and read from
def load_dataset(DATA_PATH, spark):
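    """Download the 2013 American Community Survey dataset from Kaggle,
    unzip it if needed, and return the person/housing records joined into
    a single DataFrame with weight, flag, and identifier columns dropped."""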

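    # looks for credentials in ~/.kaggle/kaggle.json (or KAGGLE_USERNAME/KAGGLE_KEY)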
    kaggle.api.authenticate()

    # unzip=False, otherwise the whole archive is re-downloaded on every run
    kaggle.api.dataset_download_files('census/2013-american-community-survey',
                                      path=DATA_PATH,
                                      force=False,
                                      quiet=False,
                                      unzip=False)

    # csv_files lists the csv files already extracted from the dataset
    csv_files = [x for x in os.listdir(DATA_PATH) if x.endswith('.csv')]

    # if dataset not already unzipped, unzip it
    if not csv_files:
        with zipfile.ZipFile(DATA_PATH + '/2013-american-community-survey.zip',
                             'r') as zip_ref:
            zip_ref.extractall(DATA_PATH)

    # person-level and housing-level record files
    pfiles = ["ss13pusa.csv", "ss13pusb.csv"]
    hfiles = ["ss13husa.csv", "ss13husb.csv"]
    df_p = spark.read.csv([DATA_PATH + '/' + f for f in pfiles],
                          header=True,
                          inferSchema=True)
    df_h = spark.read.csv([DATA_PATH + '/' + f for f in hfiles],
                          header=True,
                          inferSchema=True)
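    # note: inferSchema=True costs an extra pass over the CSVs to guess column
    # types; an explicit schema would avoid that pass on files this large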

    # drop columns in housing and person
    dropping_list = [
        'PERNP', 'WAGP', 'HINCP', 'FINCP', 'RT', 'DIVISION', 'REGION',
        'ADJINC', 'ADJHSG', 'WGTP', 'PWGTP', 'SPORDER', 'VACS'
    ]
    # keys shared by the person and housing tables, used for the join
    join_list = ['SERIALNO', 'PUMA', 'ST']

    df_p = df_p.drop(*dropping_list)
    df_h = df_h.drop(*dropping_list)

    # join the two dataframes
    utils.printNowToFile("join df started:")
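    # inner join keeps only person records with a matching housing record on the shared keys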
    df = df_p.join(df_h, on=join_list, how='inner')
    utils.printNowToFile("join df end:")

    del df_h
    del df_p

    # drop the join keys that are no longer needed; VACS (entirely null) was
    # already removed through dropping_list above, so dropping it again is a no-op
    df = df.drop('PUMA', 'SERIALNO', 'VACS')

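    # replicate-weight columns (pwgtp*, wgtp*) and allocation-flag columns
    # (names of the form F...P, minus a few real variables) matched by regex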
    weight_cols_p = df.select(df.colRegex("`(pwgtp)+?.+`"))
    weight_cols_h = df.select(df.colRegex("`(wgtp)+?.+`"))
    flag_cols = df.select(
        df.colRegex("`(?!FOD1P|FOD2P|FIBEROP|FULP|FPARC|FINCP)(F)+?.+(P)`"))

    df = df.drop(*weight_cols_p.schema.names)
    df = df.drop(*weight_cols_h.schema.names)
    df = df.drop(*flag_cols.schema.names)

    return df
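
## Minimal usage sketch (not in the original script): the DATA_PATH value and
## the persist() call are assumptions, shown only to illustrate how the
## function is meant to be called from the driver.
# DATA_PATH = os.path.join(os.getcwd(), 'data')
# df = load_dataset(DATA_PATH, spark)
# df.persist(StorageLevel.MEMORY_AND_DISK)  # keep the joined table cached for later steps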