Example #1
def gen_cyclic_metadata(column_data, parameters=None):

    data_min = min(map(float, column_data))
    data_max = max(map(float, column_data))

    if not parameters:
        parameters = dict(min=data_min, max=data_max)
    else:
        if 'min' not in parameters or 'max' not in parameters:
            raise utils.BayesDBError(
                "Error: cyclic columns require (min, max) parameters.")
        else:
            param_min = float(parameters['min'])
            param_max = float(parameters['max'])
            if data_min < param_min:
                raise utils.BayesDBError(
                    "Error: cyclic contains data less than specified "
                    "minimum %f" % param_min)
            elif data_max > param_max:
                raise utils.BayesDBError(
                    "Error: cyclic contains data greater than specified "
                    "maximum %f" % param_max)
            else:
                parameters = dict(min=param_min, max=param_max)

    return dict(modeltype="vonmises",
                value_to_code=dict(),
                code_to_value=dict(),
                parameters=parameters)
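
A minimal usage sketch (hypothetical data; assumes gen_cyclic_metadata above and utils are importable): hour-of-day values declared cyclic on [0, 24).

metadata = gen_cyclic_metadata(['3.5', '17.0', '23.9'],
                               parameters={'min': 0, 'max': 24})
# metadata == {'modeltype': 'vonmises', 'value_to_code': {}, 'code_to_value': {},
#              'parameters': {'min': 0.0, 'max': 24.0}}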
Example #2
 def get_metadata(self, tablename):
     try:
         metadata_path = os.path.join(self.data_dir, tablename, 'metadata.pkl')
         f = open(metadata_path, 'rb')
     except Exception:
         raise utils.BayesDBError("Error: metadata does not exist. Has %s been corrupted?"
                                  % self.data_dir)
     try:
         metadata = pickle.load(f)
     except Exception:
         raise utils.BayesDBError("Error: metadata file could not be loaded for table %s"
                                  % tablename)
     f.close()
     return metadata
Example #3
 def get_column_label(self, tablename, column_name):
     column_labels = self.get_column_labels(tablename)
     if column_name.lower() in column_labels:
         return column_labels[column_name.lower()]
     else:
         raise utils.BayesDBError('Column %s in btable %s has no label.'
                                  % (column_name, tablename))
Example #4
def gen_categorical_metadata(column_data, parameters=None):
    def get_is_not_nan(el):
        if isinstance(el, str):
            return el.upper() != 'NAN'
        else:
            return True

    unique_codes = list(set(column_data))
    unique_codes = list(filter(get_is_not_nan, unique_codes))
    #
    values = range(len(unique_codes))
    value_to_code = dict(zip(values, unique_codes))
    code_to_value = dict(zip(unique_codes, values))

    # Set cardinality = number of distinct values if not set, otherwise check
    # that cardinality parameter is >= the number of distinct values.
    n_codes = len(unique_codes)
    if not parameters:
        parameters = dict(cardinality=n_codes)
    else:
        parameters['cardinality'] = int(parameters['cardinality'])
        if n_codes > parameters['cardinality']:
            raise utils.BayesDBError(
                "Error: categorical contains more distinct values than "
                "specified cardinality %i" % parameters['cardinality'])

    ret = dict(modeltype="symmetric_dirichlet_discrete",
               value_to_code=value_to_code,
               code_to_value=code_to_value,
               parameters=parameters)

    return ret
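
A minimal usage sketch (hypothetical data; assumes gen_categorical_metadata above is in scope). 'NAN' strings are dropped before integer codes are assigned, and the code order is unspecified because it comes from a set:

md = gen_categorical_metadata(['low', 'high', 'low', 'NAN'])
assert md['modeltype'] == 'symmetric_dirichlet_discrete'
assert md['parameters'] == {'cardinality': 2}
assert sorted(md['code_to_value'].keys()) == ['high', 'low']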
Example #5
 def get_latent_states(self, tablename, modelid=None):
     """Return X_L_list, X_D_list, and M_c"""
     metadata = self.get_metadata(tablename)
     models = self.get_models(tablename, modelid)
     if None in models.values():
         raise utils.BayesDBError('Invalid model id. Use "SHOW MODELS FOR <btable>" to see '
                                  'valid model ids.')
     M_c = metadata['M_c']
     X_L_list = [model['X_L'] for model in models.values()]
     X_D_list = [model['X_D'] for model in models.values()]
     return (X_L_list, X_D_list, M_c)
Example #6
 def get_metadata_full(self, tablename):
     try:
         f = open(os.path.join(self.data_dir, tablename, 'metadata_full.pkl'), 'rb')
     except Exception:
         raise utils.BayesDBError("Error: metadata_full file doesn't exist. This is most "
                                  "likely a result of this btable being created with an old "
                                  "version of BayesDB. Please try recreating the table from "
                                  "the original csv, and loading any models you might have.")
     metadata = pickle.load(f)
     f.close()
     return metadata
Example #7
def function_description(func, f_args, M_c):
    function_names = {'_col_typicality': 'typicality',
                      '_dependence_probability': 'dependence probability',
                      '_correlation': 'correlation',
                      '_mutual_information': 'mutual information'}

    function_name = function_names[func.__name__]

    if function_name == 'typicality':
        description = 'typicality'
    elif f_args is not None:
        function_arg = M_c['idx_to_name'][str(f_args)]
        description = '%s with %s' % (function_name, function_arg)
    else:
        raise utils.BayesDBError('Error: %s requires a column argument.' % function_name)

    return description
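
A minimal usage sketch (hypothetical names; assumes function_description above is in scope). Only func.__name__ is consulted, so a stub function suffices:

def _dependence_probability():
    pass  # stand-in; function_description only reads __name__

M_c = {'idx_to_name': {'3': 'age'}}
print(function_description(_dependence_probability, 3, M_c))
# -> dependence probability with age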
Example #8
def convert_value_to_code(M_c, cidx, value):
    """
    For a column with categorical data, this function takes the raw value
    (e.g. 'Joe' or 234.23409), which is always encoded as a string, and returns the
    'code': the integer used to represent that value in the underlying representation.

    Note that the underlying store 'code_to_value' is unfortunately named backwards.
    TODO: fix the backwards naming.
    """
    column_metadata = M_c['column_metadata'][cidx]
    modeltype = column_metadata['modeltype']
    if modeltype == 'normal_inverse_gamma':
        return float(value)
    elif modeltype == 'vonmises':
        param_min = column_metadata['parameters']['min']
        param_max = column_metadata['parameters']['max']
        return (float(value) - param_min) / (param_max - param_min)
    else:
        try:
            return column_metadata['code_to_value'][str(value)]
        except KeyError:
            raise utils.BayesDBError("Error: value '%s' not in btable." %
                                     str(value))
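
A standalone illustration of the 'vonmises' branch above (hypothetical parameters): a cyclic column declared with (min=0, max=24) is rescaled onto [0, 1).

param_min, param_max = 0.0, 24.0
assert (float('18') - param_min) / (param_max - param_min) == 0.75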
Example #9
    def update_schema(self, tablename, mappings):
        """
        mappings is a dict mapping each column name to a dict with keys 'cctype' (one of
        'cyclic', 'numerical', 'categorical', 'ignore', or 'key') and 'parameters'.
        TODO: can we get rid of cctypes?
        """
        metadata_full = self.get_metadata_full(tablename)
        cctypes_full = metadata_full['cctypes_full']
        M_c_full = metadata_full['M_c_full']
        raw_T_full = metadata_full['raw_T_full']
        colnames_full = utils.get_all_column_names_in_original_order(M_c_full)
        try:
            parameters_full = [x['parameters'] for x in M_c_full['column_metadata']]
        except KeyError:
            print('WARNING: resetting parameters to defaults. Please check these values with '
                  'DESCRIBE and adjust them manually if necessary.')
            parameters_full = []
            for md in M_c_full['column_metadata']:
                if 'dirichlet' in md['modeltype']:
                    params = {
                        'cardinality': len(md['value_to_code'])
                    }
                elif 'vonmises' in md['modeltype']:
                    params = {
                        'min': 0.0,
                        'max': 2.0 * 3.14159265358979323846264338328  # 2 * pi
                    }
                else:
                    params = None

                parameters_full.append(params)

        # Now, update cctypes_full (cctypes updated later, after removing ignores).
        mapping_set = ('numerical', 'categorical', 'ignore', 'key', 'cyclic')

        for colname, mapping in mappings.items():
            cctype = mapping['cctype']
            parameters = mapping['parameters']

            if colname.lower() not in M_c_full['name_to_idx']:
                raise utils.BayesDBError('Error: column %s does not exist.' % colname)
            elif cctype not in mapping_set:
                raise utils.BayesDBError('Error: datatype %s is not one of the valid datatypes: %s.'
                                         % (cctype, str(mapping_set)))

            cidx = M_c_full['name_to_idx'][colname.lower()]

            # If the column's current type is key, don't allow the change.
            if cctypes_full[cidx] == 'key':
                raise utils.BayesDBError('Error: %s is already set as the table key. To change its '
                                         'type, reload the table using CREATE BTABLE and choose a '
                                         'different key column.' % colname.lower())
            # If the user tries to change a column to key, it's easier to reload the table, since at
            # this point there aren't models anyway. Eventually we can build this in if it's
            # desirable.
            elif cctype == 'key':
                raise utils.BayesDBError('Error: key column already exists. To choose a different '
                                         'key, reload the table using CREATE BTABLE')

            cctypes_full[cidx] = cctype
            parameters_full[cidx] = parameters

        # Make sure there isn't more than one key.
        assert len([c for c in cctypes_full if c == 'key']) == 1

        T_full, M_r_full, M_c_full, _ = data_utils.gen_T_and_metadata(colnames_full, raw_T_full,
                                                                      cctypes=cctypes_full,
                                                                      parameters=parameters_full)

        # Variables without "_full" don't include ignored columns.
        raw_T, cctypes, colnames, parameters = data_utils.remove_ignore_cols(raw_T_full,
                                                                             cctypes_full,
                                                                             colnames_full,
                                                                             parameters_full)
        T, M_r, M_c, _ = data_utils.gen_T_and_metadata(colnames, raw_T, cctypes=cctypes,
                                                       parameters=parameters)

        # Now, put cctypes, T, M_c, and M_r back into the DB
        self.update_metadata(tablename, M_r, M_c, T, cctypes)
        self.update_metadata_full(tablename, M_r_full, M_c_full, T_full, cctypes_full)

        return self.get_metadata_full(tablename)
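
A hypothetical sketch of the mappings argument accepted by update_schema above; each value carries a 'cctype' from mapping_set and 'parameters' following the per-type conventions from examples #1 and #4:

mappings = {
    'wind_dir': {'cctype': 'cyclic', 'parameters': {'min': 0, 'max': 360}},
    'species': {'cctype': 'categorical', 'parameters': {'cardinality': 12}},
    'notes': {'cctype': 'ignore', 'parameters': None},
}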
Example #10
 def get_models(self, tablename, modelid=None):
     """
     Return the models dict for the table if modelid is None.
     If modelid is an int, then return the model specified by that id.
     If modelid is a list, then get each individual model specified by each int in that list.
     """
     models_dir = os.path.join(self.data_dir, tablename, 'models')
     if os.path.exists(models_dir):
         if modelid is not None:
             def get_single_model(modelid):
                 self.model_locks.acquire(tablename, modelid)
                 # Only return one of the models
                 full_fname = os.path.join(models_dir, 'model_%d.pkl' % modelid)
                 if not os.path.exists(full_fname):
                     self.model_locks.release(tablename, modelid)
                     return None
                 f = open(full_fname, 'rb')
                 m = pickle.load(f)
                 f.close()
                 self.model_locks.release(tablename, modelid)
                 return m
             if isinstance(modelid, list):
                 models = {}
                 for i in modelid:
                     if not utils.is_int(i):
                         raise utils.BayesDBError('Invalid modelid: %s' % str(modelid))
                     models[i] = get_single_model(int(i))
                 return models
             elif utils.is_int(modelid):
                 return get_single_model(int(modelid))
             else:
                 raise utils.BayesDBError('Invalid modelid: %s' % str(modelid))
         else:
             # Return all the models
             models = {}
             self.model_locks.acquire_table(tablename)
             fnames = os.listdir(models_dir)
             for fname in fnames:
                 if fname.startswith('model_'):
                     model_id = fname[6:]  # remove preceding 'model_'
                     model_id = int(model_id[:-4])  # remove trailing '.pkl' and cast to int
                     full_fname = os.path.join(models_dir, fname)
                     f = open(full_fname, 'rb')
                     m = pickle.load(f)
                     f.close()
                     models[model_id] = m
             self.model_locks.release_table(tablename)
             return models
     else:
         # Backwards compatibility with old model style.
         self.model_locks.acquire_table(tablename)
         try:
             f = open(os.path.join(self.data_dir, tablename, 'models.pkl'), 'rb')
             models = pickle.load(f)
             f.close()
             if modelid is not None:
                 ret = models[modelid]
             else:
                 ret = models
             self.model_locks.release_table(tablename)
             return ret
         except IOError:
             self.model_locks.release_table(tablename)
             return {}
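
Hypothetical calls (assuming a persistence object pl that exposes get_models above), covering the three modelid forms described in the docstring:

all_models = pl.get_models('mytable')                    # dict of model id -> model
one_model = pl.get_models('mytable', modelid=0)          # a single model
some_models = pl.get_models('mytable', modelid=[0, 2])   # dict of the requested ids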
Example #11
    def execute_statement(self,
                          bql_statement_ast,
                          pretty=True,
                          timing=False,
                          plots=None,
                          yes=False,
                          debug=False,
                          pandas_df=None,
                          pandas_output=True,
                          key_column=None,
                          return_raw_result=False,
                          force_output=False):
        """
        Accepts a SINGLE BQL STATEMENT as input, parses it, and executes it if it was parsed
        successfully.

        If pretty=True, then the command output will be pretty-printed as a string.
        If pretty=False, then the command output will be returned as a python object.
        If force_output=True, then results will be returned regardless of the pretty setting.

        timing=True prints out how long the command took to execute.

        For commands that have visual results, plots=True will cause those to be displayed
        by matplotlib as graphics rather than being pretty-printed as text.
        (Note that the graphics will also be saved if the user added SAVE TO <filename> to the BQL.)
        """
        if timing:
            start_time = time.time()

        parser_out = None
        # TODO move pyparsing objects out of client into parser
        if debug:
            parser_out = self.parser.parse_single_statement(bql_statement_ast)
        else:
            try:
                parser_out = self.parser.parse_single_statement(
                    bql_statement_ast)
            except Exception as e:
                raise utils.BayesDBParseError(str(e))
        if parser_out is None:
            print(
                "Could not parse command. Try typing 'help' for a list of all commands."
            )
            return
        elif not parser_out:
            return

        method_name, args_dict, client_dict = parser_out
        if client_dict is None:
            client_dict = {}

        # Do stuff now that you know the user's command, but before passing it to engine.
        if method_name == 'execute_file':
            return dict(message='execute_file',
                        bql_string=open(args_dict['filename'], 'r').read())
        elif method_name == 'update_codebook':
            _, codebook_rows = data_utils.read_csv(
                client_dict['codebook_path'], has_header=True)
            # TODO: require specific codebook_header values? Or don't require a header,
            # and if the first value in the header is actually a data column name, assume
            # the first row is codebook data, not a header.

            # Create a dict indexed by column name
            codebook = dict()
            for codebook_row in codebook_rows:
                codebook[codebook_row[0]] = dict(
                    zip(['short_name', 'description', 'value_map'],
                        codebook_row[1:]))

            args_dict['codebook'] = codebook
        elif (method_name == 'drop_btable') and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete this btable, and all associated "
                "models, without any way to get them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif (method_name == 'drop_models') and (not yes):
            # If dropping something, ask for confirmation.
            print(
                "Are you sure you want to permanently delete model(s), without any way to get "
                "them back? Enter 'y' if yes.")
            user_confirmation = raw_input()
            if 'y' != user_confirmation.strip():
                return dict(message="Operation canceled by user.")
        elif method_name == 'load_models':
            pklpath = client_dict['pkl_path']
            try:
                model_data = pickle.load(
                    gzip.open(self.parser.get_absolute_path(pklpath), 'rb'))
            except IOError:
                if pklpath[-7:] != '.pkl.gz':
                    if pklpath[-4:] == '.pkl':
                        model_data = pickle.load(
                            open(self.parser.get_absolute_path(pklpath), 'rb'))
                    else:
                        pklpath = pklpath + ".pkl.gz"
                        model_data = pickle.load(
                            gzip.open(self.parser.get_absolute_path(pklpath),
                                      'rb'))
                else:
                    raise utils.BayesDBError(
                        'Models file %s could not be found.' % pklpath)
            # This is the more recent version, where schema is stored with models.
            if 'schema' in model_data:
                args_dict['models'] = model_data['models']
                args_dict['model_schema'] = model_data['schema']
            # This supports older saved models, where only the model info was stored.
            else:
                args_dict['models'] = model_data
                args_dict['model_schema'] = None

            # Older versions of model_schema just had a str cctype as the dict items.
            # The newest version has a dict of cctype and parameters. Use these values to
            # test the recency of the models.
            model_schema = args_dict['model_schema']
            if model_schema:
                model_schema_itemtype = type(
                    model_schema[model_schema.keys()[0]])
            else:
                model_schema_itemtype = None

            if model_schema is None or model_schema_itemtype != dict:
                args_dict['model_schema'] = None
                if not yes:
                    print """WARNING! The models you are currently importing were saved without a schema
                        or without detailed column parameters (probably from a previous version).

                        If you are loading models into the same table from which you created them, problems
                        are unlikely, unless you have dropped models and then updated the schema.

                        If you are loading models into a different table from which you created them, you
                        should verify that the table schemas are the same.

                        Please use "SAVE MODELS FROM <btable> TO <filename.pkl.gz>" to create an updated copy of your models.

                        Are you sure you want to load these model(s)?
                        """
                    user_confirmation = raw_input()
                    if 'y' != user_confirmation.strip():
                        return dict(message="Operation canceled by user.")
        elif method_name == 'create_btable':
            if pandas_df is None:
                header, rows = data_utils.read_csv(client_dict['csv_path'])
            else:
                header, rows = data_utils.read_pandas_df(pandas_df)
            args_dict['header'] = header
            args_dict['raw_T_full'] = rows
            args_dict['key_column'] = key_column
            args_dict['subsample'] = False

            if 'codebook_path' in client_dict:
                _, codebook_rows = data_utils.read_csv(
                    client_dict['codebook_path'], has_header=True)
                # TODO: require specific codebook_header values? Or don't require a header,
                # and if the first value in the header is actually a data column name, assume
                # the first row is codebook data, not a header.

                # Create a dict indexed by column name
                codebook = dict()
                for codebook_row in codebook_rows:
                    codebook[codebook_row[0]] = dict(
                        zip(['short_name', 'description', 'value_map'],
                            codebook_row[1:]))
                args_dict['codebook'] = codebook
            else:
                warning = dedent("""
                WARNING!

                You are creating a btable without a codebook, which will make interpretation
                of results more difficult. Codebooks should be in CSV format with each row
                corresponding to one column of the original data. The codebook should have four
                columns:

                1. actual column name
                2. short column description
                3. long column description
                4. value map (optional, only used for categorical columns - should be in JSON
                   format)
                """)
                print(warning)

            # Display warning messages and get confirmation if btable is too large.
            # Ask user if they want to turn on subsampling.
            max_columns = 200
            max_rows = 1000
            max_cells = 100000
            message = None
            if not yes:
                if len(rows[0]) > max_columns:
                    message = "The btable you are uploading has %d columns, but BayesDB is " \
                              "currently designed to support only %d columns. If you proceed, " \
                              "performance may suffer unless you set many columns' datatypes to " \
                              "'ignore'. Would you like to continue? Enter 'y' if yes." \
                              % (len(rows[0]), max_columns)
                if len(rows) > max_rows:
                    message = "The btable you are uploading has %d rows, but BayesDB is currently "\
                              "designed to support only %d rows. If you proceed, performance may "\
                              "suffer. Would you like to continue? Enter 'y' to continue without "\
                              "subsampling, 'n' to abort, 's' to continue by subsampling %d rows, "\
                              "or a positive integer to specify the number of rows to be "\
                              "subsampled." % (len(rows), max_rows, max_rows)
                if len(rows[0]) * len(rows) > max_cells:
                    message = "The btable you are uploading has %d cells, but BayesDB is currently"\
                              " designed to support only %d cells. If you proceed, performance may"\
                              " suffer unless you enable subsampling. Enter 'y' to continue "\
                              " without subsampling, 'n' to abort, 's' to continue by subsampling "\
                              "%d rows, or a positive integer to specify the number of rows to be "\
                              "subsampled." % (len(rows)*len(rows[0]), max_cells, max_rows)
                if message is not None:
                    print(message)
                    user_confirmation = raw_input()
                    if 'y' == user_confirmation.strip():
                        pass
                    elif 'n' == user_confirmation.strip():
                        return dict(message="Operation canceled by user.")
                    elif 's' == user_confirmation.strip():
                        args_dict['subsample'] = min(max_rows, len(rows))
                    elif utils.is_int(user_confirmation.strip()):
                        args_dict['subsample'] = int(user_confirmation.strip())
                    else:
                        return dict(message="Operation canceled by user.")
        elif method_name in ['label_columns', 'update_metadata']:
            if client_dict['source'] == 'file':
                header, rows = data_utils.read_csv(client_dict['csv_path'])
                args_dict['mappings'] = {key: value for key, value in rows}

        # Call engine.
        result = self.call_bayesdb_engine(method_name, args_dict, debug)

        # If error occurred, exit now.
        if 'error' in result and result['error']:
            if pretty:
                print(result['message'])
                if force_output:
                    return result
                else:
                    return result['message']
            else:
                return result

        # Do stuff now that engine has given you output, but before printing the result.
        result = self.callback(method_name, args_dict, client_dict, result)

        if return_raw_result:
            raw_result = {
                'result': result,
                'method_name': method_name,
                'client_dict': client_dict
            }
            print("returning raw result for %s" % (method_name))
            return raw_result

        assert type(result) != int

        if timing:
            end_time = time.time()
            print('Elapsed time: %.2f seconds.' % (end_time - start_time))

        if plots is None:
            plots = 'DISPLAY' in os.environ

        if 'matrix' in result and (plots or client_dict['filename']):
            # Plot matrices
            plotting_utils.plot_matrix(result['matrix'],
                                       result['column_names'], result['title'],
                                       client_dict['filename'])
            if pretty:
                if 'column_lists' in result:
                    print(
                        self.pretty_print(
                            dict(column_lists=result['column_lists'])))

                if force_output:
                    return result
                else:
                    return self.pretty_print(result)
            else:
                return result
        if ('plot' in client_dict and client_dict['plot']):
            if (plots or client_dict['filename']):
                # Plot generalized histograms or scatterplots

                try:
                    plotting_M_c = result['metadata_full']['M_c_full']
                except KeyError:
                    plotting_M_c = result['M_c']

                plot_remove_key = method_name in ['select', 'infer']
                plotting_utils.plot_general_histogram(
                    result['column_names'],
                    result['data'],
                    plotting_M_c,
                    result['schema_full'],
                    client_dict['filename'],
                    client_dict['scatter'],
                    remove_key=plot_remove_key)
                return self.pretty_print(result)
            else:
                if 'message' not in result:
                    result['message'] = ""
                result['message'] = "Your query indicates that you would like to make a plot, but "\
                                    "in order to do so, you must either enable plotting in a "\
                                    "window or specify a filename to save to by appending 'SAVE "\
                                    "TO <filename>' to this command.\n" + result['message']

        if pretty:
            pp = self.pretty_print(result)
            print(pp)

        # Print warnings last so they're readable without scrolling backwards.
        if 'warnings' in result:
            """ Pretty-print warnings. """
            for warning in result['warnings']:
                print('WARNING: %s' % warning)

        if pandas_output and 'data' in result and 'column_labels' in result:
            result_pandas_df = data_utils.construct_pandas_df(result)
            return result_pandas_df
        else:
            return result
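
A hypothetical call pattern (assuming a client object exposing execute_statement above and a single parsed BQL statement):

result = client.execute_statement(bql_statement_ast, pretty=False, timing=True)
# pretty=False returns a python object rather than a pretty-printed string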
Example #12
def create_pairwise_plot(colnames,
                         data,
                         M_c,
                         schema_full,
                         gsp,
                         remove_key=False):
    columns = colnames[:]
    # Remove key column if present
    if remove_key:
        columns.pop(0)
        data = [row[1:] for row in data]
    # Remove any rows with nan values.
    data = [row for row in data if not any_nan(row)]
    # Stop if there are no rows remaining after cleaning missing values.
    if len(data) == 0:
        raise utils.BayesDBError(
            'There are no datapoints that contain values from every category '
            'specified. Try excluding columns with many NaN values.')

    n_columns = len(columns)
    # Rotate outer labels if there are more than 6 columns to be plotted.
    super_compress = n_columns > 6
    gsp = gs.GridSpec(n_columns, n_columns)  # note: replaces the gsp argument
    for i in range(n_columns):
        for j in range(n_columns):
            if j == 0 and i < n_columns - 1:
                # left hand marginals
                sub_colnames = [columns[i]]
                sub_data = [[x[i]] for x in data]
                parsed_data = parse_data_for_hist(sub_colnames, sub_data, M_c,
                                                  schema_full)
                create_plot(parsed_data,
                            p.subplot(gsp[i, j], adjustable='box', aspect=1),
                            False,
                            False,
                            columns[i],
                            horizontal=True,
                            compress=True,
                            super_compress=super_compress)

            elif i == n_columns - 1 and j > 0:
                # bottom marginals
                if j == 1:
                    sub_colnames = [columns[n_columns - 1]]
                    sub_data = [[x[n_columns - 1]] for x in data]
                else:
                    sub_colnames = [columns[j - 2]]
                    sub_data = [[x[j - 2]] for x in data]
                parsed_data = parse_data_for_hist(sub_colnames, sub_data, M_c,
                                                  schema_full)
                create_plot(parsed_data,
                            p.subplot(gsp[i, j], adjustable='box', aspect=1),
                            False,
                            False,
                            columns[j - 2],
                            horizontal=False,
                            compress=True,
                            super_compress=super_compress)

            elif (j != 0 and i != n_columns - 1) and j < i + 2:
                # pairwise joints
                j_col = j - 2
                if j == 1:
                    j_col = n_columns - 1
                sub_colnames = [columns[i], columns[j_col]]
                sub_data = [[x[i], x[j_col]] for x in data]
                parsed_data = parse_data_for_hist(sub_colnames, sub_data, M_c,
                                                  schema_full)
                create_plot(parsed_data,
                            p.subplot(gsp[i, j]),
                            False,
                            False,
                            horizontal=True,
                            compress=True,
                            super_compress=super_compress)
            else:
                pass
Example #13
def parse_data_for_hist(colnames, data, M_c, schema_full, remove_key=False):
    columns = colnames[:]
    # Remove key column if present
    if remove_key:
        columns.pop(0)
        data = [row[1:] for row in data]
    # Remove any rows with nan values.
    data = [row for row in data if not any_nan(row)]
    # Stop if there are no rows remaining after cleaning missing values.
    if len(data) == 0:
        raise utils.BayesDBError(
            'There are no datapoints that contain values from every category '
            'specified. Try excluding columns with many NaN values.')

    # Pull items from M_c to simplify code throughout the rest of this function
    name_to_idx = M_c['name_to_idx']
    column_metadata = M_c['column_metadata']
    cctypes = [schema_full[column] for column in columns]

    # Treat cyclic as numerical until we establish what we want in a cyclic plot.
    for cctype_idx, cctype in enumerate(cctypes):
        if cctype == 'cyclic':
            cctypes[cctype_idx] = 'numerical'

    output = {}
    if len(columns) == 1:
        np_data = np.array([x[0] for x in data])

        # Allow col_idx to be None, to allow for predictive functions to be plotted.
        if columns[0] in name_to_idx:
            col_idx = name_to_idx[columns[0]]
        else:
            col_idx = None

        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        if col_idx is None or cctypes[0] == 'numerical':
            output['datatype'] = 'cont1D'
            output['data'] = np_data
        elif cctypes[0] == 'categorical':
            unique_labels = sorted(column_metadata[name_to_idx[columns[0]]]
                                   ['code_to_value'].keys())
            counts = []
            for label in unique_labels:
                counts.append(sum(np_data == str(label)))
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = counts

        try:
            # try to get short names from M_c_full
            short_name = M_c['column_codebook'][col_idx]['short_name']
            output['axis_label'] = short_name
            output['title'] = short_name
        except KeyError:
            output['axis_label'] = columns[0]
            output['title'] = columns[0]

    elif len(columns) == 2:
        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        if columns[0] in name_to_idx:
            col_idx_1 = name_to_idx[columns[0]]
        else:
            col_idx_1 = None
        if columns[1] in name_to_idx:
            col_idx_2 = name_to_idx[columns[1]]
        else:
            col_idx_2 = None

        if cctypes[0] == 'numerical' and cctypes[1] == 'numerical':
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data]
            output['data_y'] = [x[1] for x in data]

        elif cctypes[0] == 'categorical' and cctypes[1] == 'categorical':
            counts = {}  # keys are (var 1 value, var 2 value)
            # data contains a tuple for each datapoint: (value of var 1, value of var 2)
            for row in data:
                row = tuple(row)
                if row in counts:
                    counts[row] += 1
                else:
                    counts[row] = 1

            # these are the values.
            unique_xs = sorted(
                column_metadata[col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(
                column_metadata[col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [
                du.convert_value_to_code(M_c, col_idx_2, xval)
                for xval in unique_xs
            ]
            y_ordered_codes = [
                du.convert_value_to_code(M_c, col_idx_1, yval)
                for yval in unique_ys
            ]

            # Make count array: indexed by y index, x index
            counts_array = np.zeros(shape=(len(unique_ys), len(unique_xs)))
            for i in counts:
                # this converts from value to code
                y_index = y_ordered_codes.index(
                    column_metadata[col_idx_1]['code_to_value'][i[0]])
                x_index = x_ordered_codes.index(
                    column_metadata[col_idx_2]['code_to_value'][i[1]])
                counts_array[y_index][x_index] = float(counts[i])
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys

        elif 'numerical' in cctypes and 'categorical' in cctypes:
            output['datatype'] = 'multcont'
            categories = {}

            categorical_column = cctypes.index('categorical')

            groups = sorted(column_metadata[name_to_idx[
                columns[categorical_column]]]['code_to_value'].keys())
            for i in groups:
                categories[i] = []
            for i in data:
                categories[i[categorical_column]].append(i[1 -
                                                           categorical_column])

            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            output['transpose'] = (categorical_column == 0)

        try:
            # try to get short names from M_c_full
            columns[0] = M_c['column_codebook'][col_idx_1]['short_name']
            columns[1] = M_c['column_codebook'][col_idx_2]['short_name']
        except KeyError:
            pass

        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]

        output['title'] = columns[0] + ' -versus- ' + columns[1]

    else:
        output['datatype'] = None

    return output
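
A minimal illustration of the single-column numerical path above (hypothetical inputs; assumes parse_data_for_hist and its helpers such as any_nan are in scope):

M_c = {'name_to_idx': {'height': 0}, 'column_metadata': [{}]}
out = parse_data_for_hist(['height'], [[1.2], [1.4]], M_c, {'height': 'numerical'})
# out['datatype'] == 'cont1D'; out['title'] == 'height'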
Example #14
def gen_M_c_from_T(T,
                   cctypes=None,
                   colnames=None,
                   parameters=None,
                   codebook=None):
    num_rows = len(T)
    num_cols = len(T[0])
    if cctypes is None:
        cctypes = ['numerical'] * num_cols
    if colnames is None:
        colnames = range(num_cols)
    if parameters is None:
        parameters = [None] * num_cols
    #
    T_array_transpose = numpy.array(T).T
    column_metadata = []
    for cctype, column_data, params in zip(cctypes, T_array_transpose,
                                           parameters):
        metadata_generator = metadata_generator_lookup[cctype]
        metadata = metadata_generator(column_data, params)
        column_metadata.append(metadata)

    column_codebook = []
    for col_idx, colname in enumerate(colnames):
        if codebook and colname in codebook:
            colname_codebook = codebook[colname]

            # update column metadata with value maps
            if colname_codebook['value_map'].upper() != 'NAN':
                if column_metadata[col_idx][
                        'modeltype'] != 'symmetric_dirichlet_discrete':
                    raise utils.BayesDBError(
                        'Value map specified for non-categorical column '
                        '({}).'.format(colname))

                try:
                    colvm = json.loads(colname_codebook['value_map'])
                    codes = colvm.keys()
                    values = colvm.values()
                    column_metadata[col_idx]['code_to_value'] = colvm
                    column_metadata[col_idx]['value_to_code'] = dict(
                        zip(values, codes))
                    column_metadata[col_idx]['parameters'] = {
                        'cardinality': len(values)
                    }
                except Exception:
                    raise utils.BayesDBError(
                        'Error parsing value map in codebook for {}.'.format(
                            colname))
        else:
            colname_codebook = {
                'description': 'No description',
                'short_name': colname,
                'value_map': None
            }
        column_codebook.append(colname_codebook)

    name_to_idx = dict(zip(colnames, range(num_cols)))
    idx_to_name = dict(zip(map(str, range(num_cols)), colnames))

    M_c = dict(name_to_idx=name_to_idx,
               idx_to_name=idx_to_name,
               column_metadata=column_metadata,
               column_codebook=column_codebook)
    return M_c
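
A minimal usage sketch (hypothetical data; assumes gen_M_c_from_T above and the generators it dispatches to via metadata_generator_lookup are in scope):

T = [[1.0, 'a'], [2.0, 'b'], [3.0, 'a']]
M_c = gen_M_c_from_T(T, cctypes=['numerical', 'categorical'],
                     colnames=['x', 'label'])
assert M_c['name_to_idx'] == {'x': 0, 'label': 1}
assert M_c['idx_to_name'] == {'0': 'x', '1': 'label'}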