Пример #1
0
def svr_xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a python dictionary.

    @raw_data, generally a file (or xml string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument
        exposes a 'close' method, it is closed before this method returns,
        including when validation fails, or an exception is raised.

    @list_observation_label, is a list containing dependent variable
        labels.

    Returns a dict with the keys 'dataset', 'observation_labels', and
    'feature_count', or None when a predictor value fails validation.

    Note: 'xmltodict' represents a single child element as a dict, and
          repeated child elements as a list. Both shapes are normalized to
          a list, so a dataset with one observation, or an observation with
          one predictor, is processed like any other.

    '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    logger = Logger(__name__, 'error', 'error')

    try:
        # convert xml file to python 'dict'
        dataset = xmltodict.parse(raw_data)

        observations = dataset['dataset']['observation']
        if not isinstance(observations, list):
            observations = [observations]

        # build 'list_dataset'
        for observation in observations:
            # the label is read up front, so it never depends on whether
            # 'criterion' precedes 'predictor' within the observation
            observation_label = observation['criterion']
            list_observation_label.append(observation_label)

            predictors = observation['predictor']
            if not isinstance(predictors, list):
                predictors = [predictors]

            for predictor in predictors:
                predictor_label = predictor['label']
                predictor_value = predictor['value']

                validate_value = Validator(predictor_value)
                validate_value.validate_value()
                list_error_value = validate_value.get_errors()
                if list_error_value:
                    logger.log(list_error_value)
                    return None

                list_dataset.append({
                    'dep_variable_label': str(observation_label),
                    'indep_variable_label': str(predictor_label),
                    'indep_variable_value': predictor_value
                })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(predictors)

    finally:
        # release the file on every exit path; plain strings have no 'close'
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
Пример #2
0
def csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object to a list of python
    dictionaries, one per (non-empty) data row.

    @raw_data, generally a file containing the raw dataset(s), to be used
        when computing a corresponding model. If this argument exposes a
        'close' method, it is closed before this method returns, including
        when an exception is raised.

    Each returned element has the following keys:

        - 'dependent-variable', the first column of the row.
        - 'independent-variables', a one element list, containing a dict
              mapping each header label to the row value. The dict is empty
              when the row fails validation.
        - 'error', None, or a message when the row fails validation.

    Note: 'raw_data' is expected to have been opened with 'Universal Newline
          Support', so newlines are understood regardless, if the newline
          character was created in osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element, the
          following line is required in this method:

          row = row[0].split(',')

    Note: empty rows (i.e. blank lines) are skipped, since the reader yields
          an empty list for them, which has no first element to split.

    '''

    # local variables:
    dataset = []
    validate = Validator()
    indep_labels_list = []

    try:
        # local variable: open temporary 'csvfile' reader object
        dataset_reader = csv.reader(
            raw_data,
            delimiter=' ',
            quotechar='|'
        )

        # first non-empty row of csvfile: get all columns, except first
        for row in dataset_reader:
            if row:
                indep_labels_list = row[0].split(',')[1:]
                break

        # all remaining rows of csvfile
        for row in dataset_reader:
            if not row:
                continue

            row_arr = row[0].split(',')
            features_list = row_arr[1:]

            # merge lists into dict if each independent variable validates
            if all(validate.validate_value(item) for item in features_list):
                features_dict = dict(zip(indep_labels_list, features_list))
                error = None
            else:
                # never reuse the features from a previously parsed row
                features_dict = {}
                error = 'csv conversion failed: ' + validate.get_error()

            dataset.append({
                'dependent-variable': row_arr[0],
                'independent-variables': [features_dict],
                'error': error
            })

    finally:
        # close file on every exit path
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    return dataset
Пример #3
0
def csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object to a list of python
    dictionaries, one per (non-empty) data row.

    @raw_data, generally a file containing the raw dataset(s), to be used
        when computing a corresponding model. If this argument exposes a
        'close' method, it is closed before this method returns, including
        when an exception is raised.

    Each returned element has the following keys:

        - 'dependent-variable', the first column of the row.
        - 'independent-variables', a one element list, containing a dict
              mapping each header label to the row value. The dict is empty
              when the row fails validation.
        - 'error', None, or a message when the row fails validation.

    Note: 'raw_data' is expected to have been opened with 'Universal Newline
          Support', so newlines are understood regardless, if the newline
          character was created in osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element, the
          following line is required in this method:

          row = row[0].split(',')

    Note: empty rows (i.e. blank lines) are skipped, since the reader yields
          an empty list for them, which has no first element to split.

    '''

    # local variables:
    dataset = []
    validate = Validator()
    indep_labels_list = []

    try:
        # local variable: open temporary 'csvfile' reader object
        dataset_reader = csv.reader(raw_data, delimiter=' ', quotechar='|')

        # first non-empty row of csvfile: get all columns, except first
        for row in dataset_reader:
            if row:
                indep_labels_list = row[0].split(',')[1:]
                break

        # all remaining rows of csvfile
        for row in dataset_reader:
            if not row:
                continue

            row_arr = row[0].split(',')
            features_list = row_arr[1:]

            # merge lists into dict if each independent variable validates
            if all(validate.validate_value(item) for item in features_list):
                features_dict = dict(zip(indep_labels_list, features_list))
                error = None
            else:
                # never reuse the features from a previously parsed row
                features_dict = {}
                error = 'csv conversion failed: ' + validate.get_error()

            dataset.append({
                'dependent-variable': row_arr[0],
                'independent-variables': [features_dict],
                'error': error
            })

    finally:
        # close file on every exit path
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    return dataset
Пример #4
0
def xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a list of python
    dictionaries, one per observation.

    @raw_data, generally a file (or xml string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument
        exposes a 'close' method, it is closed before this method returns,
        including when an exception is raised.

    Each returned element has the following keys:

        - 'dependent-variable', the observation's dependent variable.
        - 'independent-variables', a one element list, containing a dict
              mapping each validated feature label to its value.
        - 'error', None, or the message for the first feature of the
              observation that fails validation.

    Note: 'xmltodict' represents a single child element as a dict, and
          repeated child elements as a list. Both shapes are normalized to
          a list, so a dataset with one observation, or an observation with
          one feature, is processed like any other.

    '''

    # local variables
    dataset = []
    validate = Validator()

    try:
        # local variable: open temporary 'xmltodict' object
        dataset_reader = xmltodict.parse(raw_data)

        observations = dataset_reader['dataset']['observation']
        if not isinstance(observations, list):
            observations = [observations]

        # build dataset
        for observation in observations:
            features_dict = {}
            # reset per observation: a later valid feature must not clear
            # a failure recorded for an earlier feature
            error = None
            dependent_variable = observation['dependent-variable']

            features = observation['independent-variable']
            if not isinstance(features, list):
                features = [features]

            # define features set if independent variable validates
            for feature in features:
                if validate.validate_value(feature['value']):
                    features_dict[feature['label']] = feature['value']
                elif error is None:
                    error = 'xml conversion failed: ' + validate.get_error()

            dataset.append({
                'dependent-variable': dependent_variable,
                'independent-variables': [features_dict],
                'error': error
            })

    finally:
        # close file on every exit path; plain strings have no 'close'
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    return dataset
Пример #5
0
def xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a list of python
    dictionaries, one per observation.

    @raw_data, generally a file (or xml string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument
        exposes a 'close' method, it is closed before this method returns,
        including when an exception is raised.

    Each returned element has the following keys:

        - 'dependent-variable', the observation's dependent variable.
        - 'independent-variables', a one element list, containing a dict
              mapping each validated feature label to its value.
        - 'error', None, or the message for the first feature of the
              observation that fails validation.

    Note: 'xmltodict' represents a single child element as a dict, and
          repeated child elements as a list. Both shapes are normalized to
          a list, so a dataset with one observation, or an observation with
          one feature, is processed like any other.

    '''

    # local variables
    dataset = []
    validate = Validator()

    try:
        # local variable: open temporary 'xmltodict' object
        dataset_reader = xmltodict.parse(raw_data)

        observations = dataset_reader['dataset']['observation']
        if not isinstance(observations, list):
            observations = [observations]

        # build dataset
        for observation in observations:
            features_dict = {}
            # reset per observation: a later valid feature must not clear
            # a failure recorded for an earlier feature
            error = None
            dependent_variable = observation['dependent-variable']

            features = observation['independent-variable']
            if not isinstance(features, list):
                features = [features]

            # define features set if independent variable validates
            for feature in features:
                if validate.validate_value(feature['value']):
                    features_dict[feature['label']] = feature['value']
                elif error is None:
                    error = 'xml conversion failed: ' + validate.get_error()

            dataset.append({
                'dependent-variable': dependent_variable,
                'independent-variables': [features_dict],
                'error': error
            })

    finally:
        # close file on every exit path; plain strings have no 'close'
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    return dataset
Пример #6
0
def svm_csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object, intended for an svm
    model, to a python dictionary.

    @raw_data, generally a file containing the raw dataset(s), to be used
        when computing a corresponding model. If this argument exposes a
        'close' method, it is closed before this method returns, including
        on the early (failure) returns.

    @list_observation_label, is a list containing dependent variable labels.

    Returns a dict with the keys 'dataset', 'observation_labels', and
    'feature_count'. Returns None when a value fails validation, and False
    when validating, or converting a value to float raises an exception.

    Note: 'raw_data' is expected to have been opened with 'Universal Newline
          Support', so newlines are understood regardless, if the newline
          character was created in osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element, the
          following line is required in this method:

          row = row[0].split(',')

    Note: empty rows (i.e. blank lines) are skipped, since the reader yields
          an empty list for them, which has no first element to split.

    '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    list_feature_label = []
    logger = Logger(__name__, 'error', 'error')

    try:
        # open temporary 'csvfile' reader object
        dataset_reader = csv.reader(
            raw_data,
            delimiter=' ',
            quotechar='|'
        )

        # first non-empty row of csvfile: feature labels (skip first column)
        for row in dataset_reader:
            if row:
                list_feature_label = [
                    str(value) for value in row[0].split(',')[1:]
                ]
                break

        # iterate all remaining rows of csvfile
        for row in dataset_reader:
            if not row:
                continue

            columns = row[0].split(',')

            # first column of each row is the observation label; it is kept
            # in a local, since skipped rows would misalign an index lookup
            observation_label = str(columns[0])
            list_observation_label.append(observation_label)

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(columns) - 1

            # iterate each remaining column in a given row
            for indep_index, value in enumerate(columns[1:]):
                try:
                    validate = Validator(value)
                    validate.validate_value()

                    list_error = validate.get_errors()
                    if list_error:
                        logger.log(list_error)
                        return None

                    value = float(value)
                except Exception as error:
                    logger.log(error)
                    return False

                list_dataset.append({
                    'dep_variable_label': observation_label,
                    'indep_variable_label': list_feature_label[indep_index],
                    'indep_variable_value': value
                })

    finally:
        # close file on every exit path, including the early returns
        close = getattr(raw_data, 'close', None)
        if callable(close):
            close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
Пример #7
0
def _svm_collect_features(observation_label, features, list_dataset, logger):
    '''

    This helper validates each feature of a single observation, and appends
    the restructured, valid features to 'list_dataset'. Features failing
    validation are logged, and skipped.

    '''

    for feature_label, feature_value in features.items():
        # validation
        validate_fvalue = Validator(feature_value)
        validate_fvalue.validate_value()

        errors = validate_fvalue.get_errors()
        if errors:
            logger.log(errors)
        else:
            # restructured data
            list_dataset.append({
                'dep_variable_label': str(observation_label),
                'indep_variable_label': str(feature_label),
                'indep_variable_value': feature_value
            })


def svm_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, the raw dataset(s), to be used when computing a corresponding
        model:

        - when 'is_json' is falsy (web-interface), a json file-object, whose
              content maps each observation label to a dict of features, or
              to a list of such dicts. The file is closed by this method.
        - when 'is_json' is truthy (programmatic-interface), an indexable
              object, where index 0 is the observation label, and index 1 is
              a dict of features, or a list of such dicts.

    @is_json, flag indicating 'raw_data' is already a parsed json object.

    @observation_labels, is a list containing dependent variable labels.

    Returns a dict with the keys 'dataset', 'observation_labels', and
    'feature_count'. The feature count is taken from the first observation
    having a non-zero number of features.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # programmatic-interface
    if is_json:
        labelled_observations = [(raw_data[0], raw_data[1])]

    # web-interface
    else:
        try:
            dataset = json.load(raw_data)
        finally:
            # content is fully read (or unreadable): close file either way
            raw_data.close()
        labelled_observations = dataset.items()

    for observation_label, observations in labelled_observations:
        # list of observation label
        observation_labels.append(observation_label)

        # dependent variable with single observation
        if isinstance(observations, dict):
            feature_sets = [observations]

        # dependent variable with multiple observations
        elif isinstance(observations, list):
            feature_sets = observations

        # any other shape contributes a label, but no features
        else:
            feature_sets = []

        for feature_set in feature_sets:
            _svm_collect_features(
                observation_label,
                feature_set,
                list_dataset,
                logger
            )

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(feature_set)

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
Пример #8
0
def dataset2dict(model_type, upload):
    '''

    This method converts the supplied csv, json, or xml file upload(s), or
    dataset url(s), to a uniform dict object, using necessary converter
    utility functions.

    @model_type, compared against the 'MODEL_TYPE' list from the application
        configuration: the first entry selects classification validation,
        while the second entry selects regression validation. Any other
        value is reported as a validation error.

    @upload, uploaded dataset(s), with the following keys:

        - 'dataset', the dataset(s) to convert.
        - 'properties', the settings, where 'stream' equal to 'True' selects
              the programmatic-interface, and 'dataset_type' distinguishes
              'file_upload' from 'dataset_url'.

    Returns a dict with the keys 'dataset' (converted observations),
    'settings', and 'error', which is None, or a dict with a 'validation'
    list of {'location', 'message'} entries.

    '''

    # local variables
    list_error = []
    converted = []
    validator = Validator()
    datasets = upload['dataset']
    settings = upload['properties']
    stream = settings.get('stream', None)
    list_model_type = current_app.config.get('MODEL_TYPE')

    def validate(instance):
        '''Return the validation error for 'instance', falsy when valid.'''

        if model_type == list_model_type[0]:
            return validator.validate_classification(instance)

        if model_type == list_model_type[1]:
            return validator.validate_regression(instance)

        # previously this case left the error variable unassigned
        return 'unsupported model type: ' + str(model_type)

    # programmatic-interface
    if stream == 'True':
        session_name = settings['session_name']
        dataset_type = settings['dataset_type']

        # convert dataset(s) into extended list
        for dataset in datasets:
            # scrape url content
            if dataset_type == 'dataset_url':
                instance = requests.get(dataset).json()['dataset']

            else:
                instance = [dataset]

            if instance:
                error = validate(instance)
                if error:
                    list_error.append({
                        'location': session_name,
                        'message': error
                    })

                # NOTE(review): unlike the web-interface, the instance is
                #     kept even when validation fails; confirm if intended.
                converted.extend(instance)

    # web-interface
    else:
        dataset_type = settings['dataset_type']
        if dataset_type == 'file_upload':
            adjusted_datasets = datasets['file_upload']

        else:
            adjusted_datasets = datasets['dataset_url']

        # convert dataset(s) into extended list
        for dataset in adjusted_datasets:
            # reset per dataset: an unsupported file extension must not
            # reuse the instance parsed from the previous dataset
            instance = None

            # NOTE(review): url entries are passed to 'requests.get', so
            #     they are presumably plain strings, which cannot be indexed
            #     by 'filename'; confirm against the caller.
            if dataset_type == 'dataset_url' and not isinstance(dataset, dict):
                location = dataset
            else:
                location = dataset['filename']

            # scrape url content
            if dataset_type == 'dataset_url':
                instance = requests.get(dataset).json()['dataset']

            # file content
            else:
                filename = dataset['filename'].lower()

                if filename.endswith('.csv'):
                    instance = csv2dict(dataset['file'])

                elif filename.endswith('.json'):
                    # load dataset instance
                    try:
                        instance = json.load(dataset['file'])['dataset']
                    except Exception:
                        # unparsable content: pass the raw file along, so
                        # the validation below reports the problem
                        instance = dataset['file']

                elif filename.endswith('.xml'):
                    instance = xml2dict(dataset['file'])

            if instance:
                error = validate(instance)
                if error:
                    list_error.append({'location': location, 'message': error})
                else:
                    converted.extend(instance)

            else:
                list_error.append({
                    'location': location,
                    'message': 'empty dataset, or invalid syntax (try lint)'
                })

    # return results
    return {
        'dataset': converted,
        'settings': settings,
        'error': {'validation': list_error} if list_error else None
    }
Пример #9
0
def _svr_collect_predictors(criterion, predictors, list_dataset, logger):
    '''

    This helper validates each predictor of a single observation, and
    appends the restructured, valid predictors to 'list_dataset'. Predictors
    failing validation are logged, and skipped.

    '''

    for label, predictor in predictors.items():
        # validation (part 1)
        validate_predictor = Validator(predictor)
        validate_predictor.validate_value()

        errors = validate_predictor.get_errors()
        if errors:
            logger.log(errors)
        else:
            # restructured data
            list_dataset.append({
                'dep_variable_label': str(criterion),
                'indep_variable_label': str(label),
                'indep_variable_value': predictor
            })


def svr_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, the raw dataset(s), to be used when computing a corresponding
        model:

        - when 'is_json' is falsy (web-interface), a json file-object, which
              is parsed, and closed by this method.
        - when 'is_json' is truthy (programmatic-interface), an already
              parsed dict.

        In both cases, the content maps each criterion to a dict of
        predictors, or to a list of such dicts.

    @is_json, flag indicating 'raw_data' is already a parsed json object.

    @observation_labels, is a list containing dependent variable labels.

    Returns a dict with the keys 'dataset', 'observation_labels', and
    'feature_count'. The feature count is taken from the first observation
    having a non-zero number of predictors.

    Note: the feature count is only computed from dict shaped observations.
          Previously, the programmatic-interface computed it directly on the
          criterion's value, which raised an 'AttributeError' when that
          value was a list of observations.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # programmatic-interface
    if is_json:
        dataset = raw_data

    # web-interface
    else:
        try:
            dataset = json.load(raw_data)
        finally:
            # content is fully read (or unreadable): close file either way
            raw_data.close()

    for criterion, predictors in dataset.items():
        # list of observation label
        observation_labels.append(criterion)

        # criterion with single observation
        if isinstance(predictors, dict):
            predictor_sets = [predictors]

        # criterion with multiple observation
        elif isinstance(predictors, list):
            predictor_sets = predictors

        # any other shape contributes a label, but no predictors
        else:
            predictor_sets = []

        for predictor_set in predictor_sets:
            _svr_collect_predictors(
                criterion,
                predictor_set,
                list_dataset,
                logger
            )

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(predictor_set)

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
Пример #10
0
def dataset2dict(model_type, upload):
    '''

    This method converts the supplied csv, json, or xml file upload(s), or
    dataset url(s), to a uniform dict object, using necessary converter
    utility functions.

    @model_type, compared against the 'MODEL_TYPE' list from the application
        configuration: the first entry selects classification validation,
        while the second entry selects regression validation. Any other
        value is reported as a validation error.

    @upload, uploaded dataset(s), with the following keys:

        - 'dataset', the dataset(s) to convert.
        - 'properties', the settings, where 'stream' equal to 'True' selects
              the programmatic-interface, and 'dataset_type' distinguishes
              'file_upload' from 'dataset_url'.

    Returns a dict with the keys 'dataset' (converted observations),
    'settings', and 'error', which is None, or a dict with a 'validation'
    list of {'location', 'message'} entries.

    '''

    # local variables
    list_error = []
    converted = []
    validator = Validator()
    datasets = upload['dataset']
    settings = upload['properties']
    stream = settings.get('stream', None)
    list_model_type = current_app.config.get('MODEL_TYPE')

    def validate(instance):
        '''Return the validation error for 'instance', falsy when valid.'''

        if model_type == list_model_type[0]:
            return validator.validate_classification(instance)

        if model_type == list_model_type[1]:
            return validator.validate_regression(instance)

        # previously this case left the error variable unassigned
        return 'unsupported model type: ' + str(model_type)

    # programmatic-interface
    if stream == 'True':
        session_name = settings['session_name']
        dataset_type = settings['dataset_type']

        # convert dataset(s) into extended list
        for dataset in datasets:
            # scrape url content
            if dataset_type == 'dataset_url':
                instance = requests.get(dataset).json()['dataset']

            else:
                instance = [dataset]

            if instance:
                error = validate(instance)
                if error:
                    list_error.append({
                        'location': session_name,
                        'message': error
                    })

                # NOTE(review): unlike the web-interface, the instance is
                #     kept even when validation fails; confirm if intended.
                converted.extend(instance)

    # web-interface
    else:
        dataset_type = settings['dataset_type']
        if dataset_type == 'file_upload':
            adjusted_datasets = datasets['file_upload']

        else:
            adjusted_datasets = datasets['dataset_url']

        # convert dataset(s) into extended list
        for dataset in adjusted_datasets:
            # reset per dataset: an unsupported file extension must not
            # reuse the instance parsed from the previous dataset
            instance = None

            # NOTE(review): url entries are passed to 'requests.get', so
            #     they are presumably plain strings, which cannot be indexed
            #     by 'filename'; confirm against the caller.
            if dataset_type == 'dataset_url' and not isinstance(dataset, dict):
                location = dataset
            else:
                location = dataset['filename']

            # scrape url content
            if dataset_type == 'dataset_url':
                instance = requests.get(dataset).json()['dataset']

            # file content
            else:
                filename = dataset['filename'].lower()

                if filename.endswith('.csv'):
                    instance = csv2dict(dataset['file'])

                elif filename.endswith('.json'):
                    # load dataset instance
                    try:
                        instance = json.load(dataset['file'])['dataset']
                    except Exception:
                        # unparsable content: pass the raw file along, so
                        # the validation below reports the problem
                        instance = dataset['file']

                elif filename.endswith('.xml'):
                    instance = xml2dict(dataset['file'])

            if instance:
                error = validate(instance)
                if error:
                    list_error.append({
                        'location': location,
                        'message': error
                    })
                else:
                    converted.extend(instance)

            else:
                list_error.append({
                    'location': location,
                    'message': 'empty dataset, or invalid syntax (try lint)'
                })

    # return results
    return {
        'dataset': converted,
        'settings': settings,
        'error': {'validation': list_error} if list_error else None
    }