def compute_relationships(project_id):
    all_datasets = db_access.get_datasets(project_id)
    relationships = []

    if len(all_datasets) < 2:
        return relationships

    for dataset_a, dataset_b in combinations(all_datasets, 2):
        dataset_a_fields = db_access.get_field_properties(
            project_id, dataset_a['id'])
        dataset_b_fields = db_access.get_field_properties(
            project_id, dataset_b['id'])

        for index_a, field_a in enumerate(dataset_a_fields):
            for index_b, field_b in enumerate(dataset_b_fields):
                logger.info('%s:%s - %s:%s', dataset_a['title'],
                            field_a['name'], dataset_b['title'],
                            field_b['name'])
                unique_field_a_values = field_a.get('unique_values')
                unique_field_b_values = field_b.get('unique_values')

                if (not unique_field_a_values) or (not unique_field_b_values):
                    continue

                len_a = len(unique_field_a_values)
                len_b = len(unique_field_b_values)

                d = get_distance(unique_field_a_values, unique_field_b_values)
                logger.info('%s-%s: %s', field_a['name'], field_b['name'], d)

                if d >= THRESHOLD:
                    if len_a == len_b:
                        relationship_type = "11"
                    elif (len_a > len_b):
                        relationship_type = "N1"
                    elif (len_a < len_b):
                        relationship_type = "1N"
                    else:
                        relationship_type = None
                else:
                    continue

                relationship = {
                    'source_dataset_id': dataset_a['id'],
                    'source_field_id': field_a['id'],
                    'target_dataset_id': dataset_b['id'],
                    'target_field_id': field_b['id'],
                    'source_dataset_name': dataset_a['title'],
                    'source_field_name': field_a['name'],
                    'target_dataset_name': dataset_b['title'],
                    'target_field_name': field_b['name'],
                    'distance': d,
                    'type': relationship_type
                }
                relationships.append(relationship)

    return relationships
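
# get_distance and THRESHOLD are defined elsewhere in the codebase and are not
# shown here. A minimal, hypothetical stand-in (an overlap score over the two
# fields' unique values) to illustrate how the d >= THRESHOLD check above could
# classify relationships; the project's real implementation may differ:
EXAMPLE_THRESHOLD = 0.75

def example_get_distance(unique_values_a, unique_values_b):
    # Fraction of the smaller field's unique values that also appear in the other field
    set_a, set_b = set(unique_values_a), set(unique_values_b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / float(min(len(set_a), len(set_b)))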
Example #2
def compute_relationships(project_id):
    all_datasets = db_access.get_datasets(project_id)
    relationships = []

    if len(all_datasets) < 2:
        return relationships

    for dataset_a, dataset_b in combinations(all_datasets, 2):
        dataset_a_fields = db_access.get_field_properties(project_id, dataset_a['id'])
        dataset_b_fields = db_access.get_field_properties(project_id, dataset_b['id'])

        for index_a, field_a in enumerate(dataset_a_fields):
            for index_b, field_b in enumerate(dataset_b_fields):
                logger.info('%s:%s - %s:%s', dataset_a['title'], field_a['name'], dataset_b['title'], field_b['name'])
                unique_field_a_values = field_a.get('unique_values')
                unique_field_b_values = field_b.get('unique_values')

                if (not unique_field_a_values) or (not unique_field_b_values):
                    continue

                len_a = len(unique_field_a_values)
                len_b = len(unique_field_b_values)

                d = get_distance(unique_field_a_values, unique_field_b_values)
                logger.info('%s-%s: %s', field_a['name'], field_b['name'], d)

                if d >= THRESHOLD:
                    if len_a == len_b:
                        relationship_type = "11"
                    elif (len_a > len_b):
                        relationship_type = "N1"
                    elif (len_a < len_b):
                        relationship_type = "1N"
                    else:
                        relationship_type = None
                else:
                    continue

                relationship = {
                    'source_dataset_id': dataset_a['id'],
                    'source_field_id': field_a['id'],
                    'target_dataset_id': dataset_b['id'],
                    'target_field_id': field_b['id'],
                    'source_dataset_name': dataset_a['title'],
                    'source_field_name': field_a['name'],
                    'target_dataset_name': dataset_b['title'],
                    'target_field_name': field_b['name'],
                    'distance': d,
                    'type': relationship_type
                }
                relationships.append(relationship)

    return relationships
Example #3
def load_data(dependent_variable_name, independent_variables_names,
              interaction_term_ids, dataset_id, project_id):
    '''
    Load DF and full field documents
    '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(
        interaction_term_ids)
    dependent_variable = next(
        (f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(
            all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique']) \
                and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop rows where every value is NA
    df_subset = df[[dependent_variable_name] + independent_variables_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
Example #4
def save_field_properties(all_properties_result, dataset_id, project_id):
    ''' Upsert all field properties corresponding to a dataset '''
    logger.debug(
        'In save_field_properties for dataset_id %s and project_id %s',
        dataset_id, project_id)

    all_properties = all_properties_result['result']
    field_properties_with_id = []
    for field_properties in all_properties:
        name = field_properties['name']

        existing_field_properties = db_access.get_field_properties(project_id,
                                                                   dataset_id,
                                                                   name=name)

        if existing_field_properties:
            field_properties = db_access.update_field_properties(
                project_id, dataset_id, **field_properties)
        else:
            field_properties = db_access.insert_field_properties(
                project_id, dataset_id, **field_properties)
        field_properties_with_id.append(field_properties)
    return {
        'desc': 'Saved %s field properties' % len(field_properties_with_id),
        'result': {
            'id': dataset_id
        }
    }
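
# A hypothetical shape for all_properties_result, inferred from the loop above:
# 'result' holds one property dict per field, keyed at least by 'name'. The
# extra keys shown are illustrative, not the exact schema.
example_all_properties_result = {
    'result': [
        {'name': 'age', 'general_type': 'q', 'is_unique': False},
        {'name': 'country', 'general_type': 'c', 'is_unique': False},
    ]
}
# save_field_properties(example_all_properties_result, dataset_id=3, project_id=1)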
Example #5
    def get(self):
        args = fieldPropertiesGetParser.parse_args()
        project_id = args.get('project_id')
        dataset_id = args.get('dataset_id')
        group_by = args.get('group_by')

        has_project_access, auth_message = project_auth(project_id)
        if not has_project_access: return auth_message

        field_properties = db_access.get_field_properties(
            project_id, dataset_id)
        interaction_terms = db_access.get_interaction_terms(
            project_id, dataset_id)

        if group_by:
            result = {}
            for fp in field_properties:
                fp_group_by = fp[group_by]
                if fp_group_by in result:
                    result[fp_group_by].append(fp)
                else:
                    result[fp_group_by] = [fp]
        else:
            result = {'field_properties': field_properties}

        result['interactionTerms'] = interaction_terms
        return make_response(jsonify(result))
Example #6
def get_initial_regression_model_recommendation(
        project_id, dataset_id, dependent_variable_id=None,
        recommendation_type=MRT.LASSO.value,
        table_layout=MCT.LEAVE_ONE_OUT.value,
        data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'],
        categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)
    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [ fp for fp in field_properties if fp['general_type'] == 'q']

    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if (fp['name'] != dependent_variable['name']):
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [ x['id'] for x in result ],
    }
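
# forward_r2, lasso, recursive_feature_elimination and f_regression are not shown.
# A hypothetical selector with the same assumed contract -- take
# (df, dependent_variable, independent_variables) and return the chosen
# independent-variable field documents -- using a simple correlation ranking
# purely for illustration:
def example_top_correlation_selector(df, dependent_variable, independent_variables, k=3):
    scored = []
    for fp in independent_variables:
        if fp.get('general_type') != 'q':
            continue  # only rank quantitative candidates
        corr = df[fp['name']].corr(df[dependent_variable['name']])
        if corr == corr:  # skip NaN correlations
            scored.append((abs(corr), fp))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [fp for _, fp in scored[:k]]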
Example #7
def load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id):
    '''
    Load DF and full field documents
    '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(interaction_term_ids)
    dependent_variable = next((f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique']) \
                and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop rows where every value is NA
    df_subset = df[[dependent_variable_name] + independent_variables_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
Example #8
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not (dataset_id): return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove unclean

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
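
# A hypothetical spec/config illustrating the keys read above; ids, names and
# values are made up for illustration:
example_spec = {
    'datasetId': 42,
    'aggregationVariablesNames': ['country', 'year'],
    'dependentVariableName': 'gdp',
}
example_config = {'weightVariableName': 'UNIFORM'}
# result, status = run_aggregation_from_spec(example_spec, project_id=1, config=example_config)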
Example #9
def get_conditioned_data(project_id, dataset_id, df, conditional_arg):
    '''
    Given a data frame and a conditional dict
    ({ and: [{field_id, operation, criteria}], or: [...] }),
    return the conditioned data frame with the same columns as the original.

    TODO Turn this into an argument of the get_data function
    '''
    full_conditional = {}

    and_clause_list = conditional_arg.get('and')
    or_clause_list = conditional_arg.get('or')
    if not (and_clause_list or or_clause_list):
        return df

    desired_keys = ['general_type', 'name', 'id']
    raw_field_properties = db_access.get_field_properties(project_id, dataset_id)
    all_field_properties = [{ k: field[k] for k in desired_keys } for field in raw_field_properties]

    query_strings = { 'and': '', 'or': '' }

    orig_cols = df.columns.tolist()
    safe_df = df.rename(columns=make_safe_string)


    if and_clause_list:
        for c in and_clause_list:
            field = next((field for field in all_field_properties if c['field_id'] == field['id']), None)
            if field and c['criteria'] is not None:
                clause = _construct_conditional_clause(field, c['operation'], c['criteria'])
                query_strings['and'] = query_strings['and'] + ' & ' + clause

    if or_clause_list:
        for c in or_clause_list:
            field = next((field for field in all_field_properties if c['field_id'] == field['id']), None)
            if field and c['criteria'] is not None:
                clause = _construct_conditional_clause(field, c['operation'], c['criteria'])
                query_strings['or'] = query_strings['or'] + ' | ' + clause

    query_strings['and'] = query_strings['and'].strip(' & ')
    query_strings['or'] = query_strings['or'].strip(' | ')

    # Concatenate
    final_query_string = ''
    if query_strings['and'] and query_strings['or']:
        final_query_string = '%s | %s' % (query_strings['and'], query_strings['or'])
    elif query_strings['and'] and not query_strings['or']:
        final_query_string = query_strings['and']
    elif query_strings['or'] and not query_strings['and']:
        final_query_string = query_strings['or']

    if not final_query_string:
        return df

    conditioned_df = safe_df.query(final_query_string)
    conditioned_df.columns = orig_cols

    return conditioned_df
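
# A hypothetical conditionals argument matching the docstring above; field_id
# values and criteria are made up for illustration:
example_conditionals = {
    'and': [
        {'field_id': 7, 'operation': '>', 'criteria': 1990},
        {'field_id': 9, 'operation': '==', 'criteria': 'Chile'},
    ],
    'or': []
}
# conditioned = get_conditioned_data(project_id, dataset_id, df, example_conditionals)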
Example #11
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=[]):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    logger.debug('Dataset: %s', dataset)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']

    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    logger.debug('Reading dataset with accessor: %s', accessor)

    df = pd.read_table(
        accessor,
        error_bad_lines = False,
        encoding = encoding,
        skiprows = dataset['offset'],
        sep = dialect['delimiter'],
        engine = 'c',
        # dtype = field_to_type_mapping,
        escapechar = dialect['escapechar'],
        doublequote = dialect['doublequote'],
        quotechar = dialect['quotechar'],
        parse_dates = True,
        nrows = nrows,
        thousands = ','
    )
    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)

    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
Example #12
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed={}, data_formats=['visualize', 'table', 'score']):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec, project_id, conditionals, config,
        data_formats (list of 'score', 'visualize', or 'table')
    Returns:
        data specified by spec, in specified format

    '''
    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id']]

    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }
    data = generating_procedure_to_data_function[gp](
        df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
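
# A hypothetical enumerated spec with the keys read above; the dataset id is
# made up and the args shape depends on the generating procedure:
example_spec = {
    'generating_procedure': GeneratingProcedure.VAL_AGG.value,
    'args': {},  # procedure-specific arguments, built by the enumeration step
    'dataset_id': 42,
}
# data = get_viz_data_from_enumerated_spec(example_spec, project_id=1, conditionals=[], config={})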
Example #14
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=[]):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']

    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    df = pd.read_table(
        accessor,
        error_bad_lines = False,
        encoding = encoding,
        skiprows = dataset['offset'],
        sep = dialect['delimiter'],
        engine = 'c',
        # dtype = field_to_type_mapping,
        escapechar = dialect['escapechar'],
        doublequote = dialect['doublequote'],
        quotechar = dialect['quotechar'],
        parse_dates = True,
        nrows = nrows,
        thousands = ','
    )
    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)

    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
Example #15
def get_full_fields_for_conditionals(conditionals, dataset_id, project_id):
    conditionals_with_full_docs = {'and': [], 'or': []}
    field_properties = db_access.get_field_properties(project_id, dataset_id)

    for clause, conditional_list in conditionals.items():
        for conditional in conditional_list:
            new_conditional = {
                'operation': conditional['operation'],
                'criteria': conditional['criteria']
            }
            matched_field_doc = next((f for f in field_properties if f['id'] == conditional['field_id']), None)
            new_conditional['field'] = {
                'general_type': matched_field_doc['general_type'],
                'name': matched_field_doc['name']
            }
            conditionals_with_full_docs[clause].append(new_conditional)

    return conditionals_with_full_docs
Example #16
def run_comparison_from_spec(spec, project_id):
    # 1) Parse and validate arguments
    indep = spec.get('indep', [])
    dep = spec.get('dep', [])
    dataset_id = spec.get('dataset_id')
    test = spec.get('test', 'ttest')
    if not (dataset_id and dep):
        return 'Not passed required parameters', 400

    fields = db_access.get_field_properties(project_id, dataset_id)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = df.dropna()  # Remove unclean

    # 3) Run test based on parameters and arguments
    comparison_result = run_comparison(df, fields, indep, dep, test)
    return {'data': comparison_result}, 200
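
# A hypothetical spec for this comparison endpoint; field names and the dataset
# id are made up for illustration:
example_spec = {
    'dataset_id': 42,
    'indep': ['treatment_group'],
    'dep': ['blood_pressure'],
    'test': 'ttest',
}
# result, status = run_comparison_from_spec(example_spec, project_id=1)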
Example #17
def run_comparison_from_spec(spec, project_id, conditionals=[]):
    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])  # [ iv[1] for iv in independent_variables ]
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)
    independence = spec.get('independence', True)

    if not (dataset_id): return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [ f for f in all_fields if f['name'] in dependent_variables_names ]
    independent_variables = [ f for f in all_fields if f['name'] in independent_variables_names ]

    can_run_numerical_comparison_independent = len([ iv for iv in independent_variables if iv['scale'] == 'continuous' ]) >= 2 and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = len([ dv for dv in dependent_variables if dv['scale'] == 'continuous' ]) >= 2 and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or can_run_numerical_comparison_independent)

    can_run_anova = (len(dependent_variables) and len(independent_variables))
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ dependent_variables_names + independent_variables_names ]
    df_ready = df_subset.dropna(how='any')  # Remove unclean
    
    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(project_id, dataset_id, df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(df_ready, independent_variables_names, dependent_variables_names, significance_cutoff=significance_cutoff, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
Example #18
def save_field_properties(all_properties_result, dataset_id, project_id):
    ''' Upsert all field properties corresponding to a dataset '''
    logger.debug('In save_field_properties for dataset_id %s and project_id %s', dataset_id, project_id)

    all_properties = all_properties_result['result']
    field_properties_with_id = []
    for field_properties in all_properties:
        name = field_properties['name']

        existing_field_properties = db_access.get_field_properties(project_id, dataset_id, name=name)

        if existing_field_properties:
            field_properties = db_access.update_field_properties(project_id, dataset_id, **field_properties)
        else:
            field_properties = db_access.insert_field_properties(project_id, dataset_id, **field_properties)
        field_properties_with_id.append(field_properties)
    return {
        'desc': 'Saved %s field properties' % len(field_properties_with_id),
        'result': {
            'id': dataset_id
        }
    }
Example #19
def enumerate_viz_specs(project_id,
                        dataset_id,
                        selected_fields,
                        recommendation_types=[],
                        spec_limit=None,
                        expanded_spec_limit=20):
    '''
    TODO Move key filtering to the db query
    TODO Incorporate 0D and 1D data returns
    '''
    specs = []
    num_selected_fields = len(selected_fields)

    # Get field properties
    desired_keys = [
        'is_id', 'is_unique', 'general_type', 'type', 'scale', 'name', 'id',
        'contiguous'
    ]
    raw_field_properties = db_access.get_field_properties(project_id,
                                                          dataset_id,
                                                          is_id=False)
    field_properties = [{k: field[k]
                         for k in desired_keys}
                        for field in raw_field_properties]

    if selected_fields:
        selected_field_docs, c_fields, c_fields_not_selected, q_fields, q_fields_not_selected, t_fields, t_fields_not_selected = \
            get_selected_fields(field_properties, selected_fields)

        if 'baseline' in recommendation_types:
            baseline_viz_specs = get_baseline_viz_specs(selected_field_docs)
            specs.extend([
                dict(s, recommendation_type='baseline')
                for s in baseline_viz_specs
            ])

        if 'subset' in recommendation_types:
            subset_viz_specs = get_subset_viz_specs(c_fields, q_fields,
                                                    t_fields,
                                                    c_fields_not_selected,
                                                    q_fields_not_selected,
                                                    t_fields_not_selected)
            specs.extend([
                dict(s, recommendation_type='subset') for s in subset_viz_specs
            ])

        if 'exact' in recommendation_types:
            exact_viz_specs = get_exact_viz_specs(c_fields, q_fields, t_fields,
                                                  c_fields_not_selected,
                                                  q_fields_not_selected,
                                                  t_fields_not_selected)
            specs.extend([
                dict(s, recommendation_type='exact') for s in exact_viz_specs
            ])

        if 'expanded' in recommendation_types:
            expanded_viz_specs = get_expanded_viz_specs(
                c_fields, q_fields, t_fields, c_fields_not_selected,
                q_fields_not_selected, t_fields_not_selected)
            if expanded_spec_limit:
                expanded_viz_specs = expanded_viz_specs[:expanded_spec_limit]
            specs.extend([
                dict(s, recommendation_type='expanded')
                for s in expanded_viz_specs
            ])

    else:
        if 'exact' in recommendation_types:
            baseline_viz_specs = get_baseline_viz_specs(field_properties)
            specs.extend([
                dict(s, recommendation_type='exact')
                for s in baseline_viz_specs
            ])

    # Deduplicate
    specs = get_list_of_unique_dicts(specs)

    # Limit Number of specs
    if spec_limit:
        specs = specs[:spec_limit]

    # Assign dataset_id to each spec
    for spec in specs:
        spec['dataset_id'] = dataset_id

    logger.info('Number of unique specs: %s', len(specs))

    return specs
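
# A hypothetical call sketch. Passing an empty selected_fields list exercises
# the baseline/'exact' branch above without assuming the shape of selected
# field documents; the project and dataset ids are made up:
example_specs = enumerate_viz_specs(
    project_id=1,
    dataset_id=42,
    selected_fields=[],
    recommendation_types=['exact'],
    spec_limit=100,
)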