def show_distinct_rows(request):
    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()

    # Second, check the data for duplicate rows and count the distinct records
    distinct_df = data_df.distinct()

    # Remove any duplicates if they exist
    if data_df.count() != distinct_df.count():
        non_duplicates_df = data_df.dropDuplicates()
    else:
        non_duplicates_df = data_df
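    # Note: distinct() and dropDuplicates() with no subset are equivalent;
    # both drop rows that are identical across every column.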

    # objects.create() already saves the row, so no extra save() call is needed
    DistinctRows.objects.create(total_count=data_df.count(),
                                distinct_rows=distinct_df.count())

    context = {
        'all_data': json_df,
        'count': data_df.count(),
        'distinct_count': distinct_df.count(),
        'distinct_df': non_duplicates_df
    }
    return render(request, 'show_distinct_rows.html', context)
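# Illustrative sketch only: `read_df` is assumed elsewhere in the project to
# load the company's table into a Spark DataFrame over JDBC, roughly along
# these lines (the URL, masked credentials and table-naming scheme mirror the
# write in `pre_process` below; the real helper may differ).
def read_df_sketch(request, suffix):
    table_name = '{}_{}'.format(request.user.project.company, suffix)
    return Spark.sqlContext.read.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/disease',
        dbtable=table_name,
        user='******',
        password='******').load()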
def show_missing_observations(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)

    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()

    # Get the Date column
    date_column = None
    for col in data_df.columns:
        if 'Date' in col:
            date_column = col

    # Show the fraction of missing observations per column
    # (fn.count(c) counts non-null values, fn.count('*') counts all rows)
    df_percentage = data_df.agg(
        *[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
          for c in data_df.columns]).collect()
    df_percentage = df_percentage[0]

    # Drop the rows whose missing observations exceed a certain threshold
    data_less_rows = data_df.dropna(thresh=3)
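    # Note on dropna(thresh=3): a row survives only if it has at least three
    # non-null values; the threshold counts non-null columns, it is not a
    # fraction. A tiny, hypothetical illustration:
    #   demo = Spark.sqlContext.createDataFrame(
    #       [(1, None, None, None), (2, 3.0, 4.0, 5.0)], ['a', 'b', 'c', 'd'])
    #   demo.dropna(thresh=3).count()  # -> 1; only the second row is kept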

    # Calculate the mean of every column except the date column
    means = data_less_rows.agg(*[
        fn.mean(c).alias(c) for c in data_less_rows.columns if c != date_column
    ]).toPandas().to_dict('records')[0]

    # The date column has no mean, so mark it (and any other null mean) as missing
    if date_column is not None:
        means[date_column] = 'missing'
    for key in means:
        if means[key] is None:
            means[key] = 'missing'

    # Fill the empty observations with the mean of the column
    # (fillna returns a new DataFrame, so the result must be reassigned)
    data_less_rows = data_less_rows.fillna(means)

    clean_json_df = data_less_rows.toPandas()

    columns = data_less_rows.columns
    data = list(df_percentage)  # Row of percentages -> plain list for JSON

    columns_json = json.dumps(columns)
    data_json = json.dumps(data)

    MissingObservations.objects.create(
        missing_columns=columns_json, missing_data=data_json)

    context = {
        'all_data': json_df,
        'missing_percentages': df_percentage,
        'clean_data': clean_json_df
    }
    return render(request, 'missing_observations.html', context)
def show_distinct_ids(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)

    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()

    # Second, check the data for duplicate rows ignoring the ID attribute
    distinct_rows = data_df.select([
        c for c in data_df.columns if c != unique_fields['index']
    ]).distinct()

    # Drop rows that are similar but may have different ids
    if data_df.count() != distinct_rows.count():
        unique_rows_df = data_df.dropDuplicates(
            subset=[c for c in data_df.columns if c != unique_fields['index']])
    else:
        unique_rows_df = data_df

    # Count the number of distinct id fields
    distinct_ids = unique_rows_df.agg(
        fn.countDistinct(unique_fields['index']).alias('distinct'))
    ids = distinct_ids.select('distinct').collect()
    distinct = ids[0].distinct

    # If the counts differ, some rows share an id but are not duplicates,
    # so assign fresh ids
    if unique_rows_df.count() != distinct:
        clean_df = unique_rows_df.withColumn('New_id',
                                             fn.monotonically_increasing_id())
    else:
        clean_df = unique_rows_df
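    # Note: monotonically_increasing_id() produces ids that are guaranteed
    # unique and increasing, but not consecutive (the value encodes the
    # partition), so 'New_id' should not be treated as a row number.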

    clean_json = clean_df.toPandas()

    DistinctIds.objects.create(total_ids=data_df.count(),
                               distinct_ids=distinct)

    context = {
        'all_data': json_df,
        'distinct_ids': distinct,
        'clean_df': clean_json
    }
    return render(request, 'show_distinct_ids.html', context)
def cluster(request):
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'clean')
    data_df.cache()
    json_df = data_df.toPandas()

    # Build (id, items) tuples from the Data Frame: the first column is taken
    # as the transaction id and the remaining columns as its items
    columns = data_df.columns

    data = []
    for row in json_df.itertuples():
        row_id = row[1]
        items = []

        for column in range(2, len(columns) + 1):
            items.append(row[column])
        data.append((row_id, items))

    # Create a Data Frame from the (id, items) tuples
    final_data = Spark.sqlContext.createDataFrame(data, ["id", "items"])
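    # For reference, FPGrowth expects one row per transaction with an array
    # column of items, e.g. (hypothetical data):
    #   [(0, ['bread', 'milk']), (1, ['bread', 'butter'])]
    # which is the shape of `data` built in the loop above.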

    # Create the FPGrowth instance with its arguments and train the model
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
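    # minSupport=0.5 keeps only itemsets that appear in at least half of the
    # transactions, and minConfidence=0.6 filters the generated rules; both
    # are fairly aggressive thresholds and may need tuning for real data.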
    model = fpGrowth.fit(final_data)

    # Frequent Item sets
    itemSets = model.freqItemsets

    # Generated Association Rules
    assocRules = model.associationRules

    # Examine the input items against all association rules and summarise the
    # consequents as predictions
    prediction = model.transform(final_data)

    context = {
        'all_data': json_df,
        'itemSets': itemSets,
        'assocRules': assocRules,
        'predicted': prediction
    }
    return render(request, 'show_clusters.html', context)
def pre_process(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)
    company_name = request.user.project.company
    created_by = request.user
    project = request.user.project

    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()

    # Get the Date column
    date_column = None
    for col in data_df.columns:
        if 'Date' in col:
            date_column = col


    # Second, check the data for duplicate rows and count the distinct records
    distinct_df = data_df.distinct()

    # Remove any duplicates if they exist
    if data_df.count() != distinct_df.count():
        non_duplicates_df = data_df.dropDuplicates()
    else:
        non_duplicates_df = data_df

    # Third, check the data for duplicate rows ignoring the ID attribute
    distinct_rows = non_duplicates_df.select([
        c for c in non_duplicates_df.columns if c != unique_fields['index']
    ]).distinct()

    # Drop rows that are similar but may have different ids
    if non_duplicates_df.count() != distinct_rows.count():
        unique_rows_df = non_duplicates_df.dropDuplicates(subset=[
            c for c in non_duplicates_df.columns if c != unique_fields['index']
        ])
    else:
        unique_rows_df = non_duplicates_df

    # Count the number of distinct id fields
    distinct_ids = unique_rows_df.agg(
        fn.countDistinct(unique_fields['index']).alias('distinct'))
    ids = distinct_ids.select('distinct').collect()
    distinct = ids[0].distinct

    # If the counts differ, some rows share an id but are not duplicates,
    # so assign fresh ids
    if unique_rows_df.count() != distinct:
        clean_df = unique_rows_df.withColumn('New_id',
                                             fn.monotonically_increasing_id())
    else:
        clean_df = unique_rows_df

    # Show the fraction of missing observations per column
    df_percentage = clean_df.agg(
        *[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
          for c in clean_df.columns]).collect()

    # Serialise the per-column missing percentages as JSON so they can be
    # stored in the database (Row.asDict() avoids fragile string munging)
    new = json.dumps(df_percentage[0].asDict())

    # Can instead use correlation to check which attributes to drop

    # Drop the rows whose missing observations exceed a certain threshold
    data_less_rows = clean_df.dropna(thresh=3)

    # Calculate the mean of every column except the date column
    means = data_less_rows.agg(*[
        fn.mean(c).alias(c) for c in data_less_rows.columns if c != date_column
    ]).toPandas().to_dict('records')[0]

    # The date column has no mean, so mark it (and any other null mean) as missing
    if date_column is not None:
        means[date_column] = 'missing'
    for key in means:
        if means[key] is None:
            means[key] = 'missing'

    # Fill the empty observations with the mean of the column
    # (fillna returns a new DataFrame, so the result must be reassigned)
    data_less_rows = data_less_rows.fillna(means)

    clean_json_df = data_less_rows.toPandas()

    table_name = str(company_name) + '_Clean'

    total_rows = data_df.count()
    distinct_count = distinct_df.count()
    distinct_without_id = distinct_rows.count()

    # Save the results to the database for easier reuse
    DistinctRows.objects.create(total_count=total_rows,
                                distinct_rows=distinct_count,
                                created_by=created_by,
                                project=project)

    DistinctIds.objects.create(total_ids=total_rows,
                               distinct_ids=distinct,
                               created_by=created_by,
                               project=project)

    MissingObservations.objects.create(missing_columns=new,
                                       created_by=created_by,
                                       project=project)

    # Final step is to save the pre_processed DF to the DB
    data_less_rows.write.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/disease',
        dbtable=table_name,
        user='******',
        password='******').mode('append').save()
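    # Note: mode('append') adds rows to the existing table on every run of
    # this view; mode('overwrite') would rebuild the clean table instead.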

    context = {
        'all_data': json_df,
        'rows_count': total_rows,
        'distinct_rows': distinct_count,
        'distinct_rows_without_id': distinct_without_id,
        'distinct_ids': distinct,
        'missing_percentages': new,
        'clean_data': clean_json_df
    }
    return render(request, 'show_distinct.html', context)


def pipeline(request):
    unique_fields = custom_fields(request)
    date_column = CustomFields.objects.first()
    date_column = date_column.date_column

    # First, read the data
    data_df = read_df(request, 'clean')
    json_df = data_df.toPandas()

    # Cast all the columns to numeric
    new_df = data_df.select(
        [col(c).cast("double").alias(c) for c in data_df.columns])
    new_df = new_df.fillna(0.0)
    new_df.show()

    # Split data into training and test sets
    train, test = new_df.randomSplit([0.7, 0.3])
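    # Note: without a seed the 70/30 split differs between runs; pass e.g.
    # randomSplit([0.7, 0.3], seed=42) for a reproducible split.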

    # Feature Processing
    featuresCols = new_df.columns
    featuresCols.remove(unique_fields['prediction'])

    # The date column may already be absent from the feature list
    try:
        featuresCols.remove(date_column)
    except ValueError:
        pass

    # This concatenates all feature columns into a single feature vector in a new column 'rawFeatures'
    vectorAssembler = VectorAssembler(inputCols=featuresCols,
                                      outputCol='rawFeatures')

    # Model Training
    standardScaler = StandardScaler(inputCol="rawFeatures",
                                    outputCol="features")
    lr = LinearRegression(labelCol=unique_fields['prediction'],
                          maxIter=10,
                          regParam=.01)

    # Model tuning
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxIter, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()
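    # This grid spans 3 x 2 x 2 x 3 = 36 parameter combinations; with
    # CrossValidator's default of 3 folds that means 108 model fits before
    # the final refit on the full training set.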

    # We define an evaluation metric.
    # This tells CrossValidator how well we are doing by comparing the true labels with predictions
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=lr.getLabelCol(),
                                    predictionCol=lr.getPredictionCol())

    # Declare the CrossValidator which runs model tuning for us.
    cv = CrossValidator(estimator=lr,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid)

    stages = [vectorAssembler, standardScaler, cv]

    # Train the pipeline
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(train)
    predictions = model.transform(test)

    rmse = evaluator.evaluate(predictions)
    print("RMSE on our test set is: " + str(rmse))

    predictions.show()

    predicted_df = predictions.toPandas()

    context = {'all_data': json_df, 'rmse': rmse, 'predicted': predicted_df}
    return render(request, 'show_predictions.html', context)