Пример #1
0
def svr_xml2dict(raw_data):
    '''

    This method converts the supplied xml file-object to a python dictionary.

    @raw_data, generally a file (or json string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument is a
        file, it needs to be closed.

    @list_observation_label, is a list containing dependent variable
        labels.

    '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    logger = Logger(__name__, 'error', 'error')

    # convert xml file to python 'dict'
    dataset = xmltodict.parse(raw_data)

    # build 'list_dataset'
    for observation in dataset['dataset']['observation']:
        for key in observation:
            if key == 'criterion':
                observation_label = observation['criterion']
                list_observation_label.append(observation[key])
            elif key == 'predictor':
                for predictor in observation[key]:
                    predictor_label = predictor['label']
                    predictor_value = predictor['value']

                    validate_value = Validator(predictor_value)
                    validate_value.validate_value()
                    list_error_value = validate_value.get_errors()
                    if list_error_value:
                        logger.log(list_error_value)
                        return None
                    else:
                        list_dataset.append({
                            'dep_variable_label':
                            str(observation_label),
                            'indep_variable_label':
                            str(predictor_label),
                            'indep_variable_value':
                            predictor_value
                        })

        # generalized feature count in an observation
        if not feature_count:
            feature_count = len(observation['predictor'])

    # save observation labels, and return
    raw_data.close()
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
Пример #2
0
def svm_csv2dict(raw_data):
    '''

    This method converts the supplied csv file-object, intended for an svm
    model, to a python dictionary.

    @raw_data, generally a file (or json string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument is a
        file, it needs to be closed.

    @list_observation_label, is a list containing dependent variable labels.

    Note: we use the 'Universal Newline Support' with the 'U' parameter when
          opening 'raw_data'. This allows newlines to be understood regardless,
          if the newline character was created in osx, windows, or linux.

    Note: since 'row' is a list, with one comma-delimited string element, the
          following line is required in this method:

          row = row[0].split(',')

        '''

    feature_count = None
    list_dataset = []
    list_observation_label = []
    list_feature_label = []
    logger = Logger(__name__, 'error', 'error')

    # open temporary 'csvfile' reader object
    dataset_reader = csv.reader(
        raw_data,
        delimiter=' ',
        quotechar='|'
    )

    # iterate first row of csvfile
    for row in islice(dataset_reader, 0, 1):

        # iterate each column in a given row
        row_indep_label = row[0].split(',')
        for value in islice(row_indep_label, 1, None):
            list_feature_label.append(str(value))

    # iterate all rows of csvfile
    for dep_index, row in enumerate(islice(dataset_reader, 0, None)):

        # iterate first column of each row (except first)
        row_dep_label = row[0].split(',')
        for value in row_dep_label[:1]:
            list_observation_label.append(str(value))

        # generalized feature count in an observation
        row_indep_variable = row[0].split(',')
        if not feature_count:
            feature_count = len(row_indep_variable) - 1

        # iterate each column in a given row
        for indep_index, value in enumerate(
            islice(row_indep_variable, 1, None)
        ):

            try:
                validate = Validator(value)
                validate.validate_value()

                list_error = validate.get_errors()
                if list_error:
                    logger.log(list_error)
                    return None
                else:
                    value = float(value)
            except Exception as error:
                logger.log(error)
                return False

            list_dataset.append({
                'dep_variable_label': list_observation_label[dep_index],
                'indep_variable_label': list_feature_label[indep_index],
                'indep_variable_value': value
            })

    # close file, save observation labels, and return
    raw_data.close()
    return {
        'dataset': list_dataset,
        'observation_labels': list_observation_label,
        'feature_count': feature_count
    }
Пример #3
0
def svm_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument is a
        file, it needs to be closed.

    @is_json, flag indicating 'raw_data' is a json string.

    @observation_labels, is a list containing dependent variable labels.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # web-interface
    if not is_json:
        dataset = json.load(raw_data)

        for observation_label in dataset:
            # variables
            observations = dataset[observation_label]

            # dependent variable with single observation
            if type(observations) == dict:
                for feature_label, feature_value in observations.items():
                    # validation
                    validate_fvalue = Validator(feature_value)
                    validate_fvalue.validate_value()

                    if validate_fvalue.get_errors():
                        logger.log(validate_fvalue.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label':
                            str(observation_label),
                            'indep_variable_label':
                            str(feature_label),
                            'indep_variable_value':
                            feature_value
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(observations)

            # dependent variable with multiple observations
            elif type(observations) == list:
                for observation in observations:
                    for feature_label, feature_value in observation.items():
                        # validation
                        validate_fvalue = Validator(feature_value)
                        validate_fvalue.validate_value()

                        if validate_fvalue.get_errors():
                            logger.log(validate_fvalue.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label':
                                str(observation_label),
                                'indep_variable_label':
                                str(feature_label),
                                'indep_variable_value':
                                feature_value
                            })

                    # generalized feature count in an observation
                    if not feature_count:
                        feature_count = len(observation)

            # list of observation label
            observation_labels.append(observation_label)

    # programmatic-interface
    else:
        dataset = raw_data
        observation_label = raw_data[0]

        # list of observation label
        observation_labels.append(observation_label)

        # dependent variable with single observation
        if type(raw_data[1]) == dict:
            for label, feature in raw_data[1].items():
                # validation
                validate_fvalue = Validator(feature)
                validate_fvalue.validate_value()

                if validate_fvalue.get_errors():
                    logger.log(validate_fvalue.get_errors())
                else:
                    # restructured data
                    list_dataset.append({
                        'dep_variable_label':
                        str(observation_label),
                        'indep_variable_label':
                        str(label),
                        'indep_variable_value':
                        feature
                    })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(raw_data[1])

        # dependent variable with multiple observations
        if type(raw_data[1]) == list:
            for feature_set in raw_data[1]:
                for feature_label, feature_value in feature_set.items():
                    # validation
                    validate_fvalue = Validator(feature_value)
                    validate_fvalue.validate_value()

                    if validate_fvalue.get_errors():
                        logger.log(validate_fvalue.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label':
                            str(observation_label),
                            'indep_variable_label':
                            str(feature_label),
                            'indep_variable_value':
                            feature_value
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(feature_set)

    # close file
    if not is_json:
        raw_data.close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }
Пример #4
0
def svr_json2dict(raw_data, is_json):
    '''

    This method converts the supplied json file-object to a python
    dictionary.

    @raw_data, generally a file (or json string) containing the raw dataset(s),
        to be used when computing a corresponding model. If this argument is a
        file, it needs to be closed.

    @is_json, flag indicating 'raw_data' is a json string.

    @observation_labels, is a list containing dependent variable labels.

    '''

    # local variables
    feature_count = None
    list_dataset = []
    observation_labels = []
    logger = Logger(__name__, 'error', 'error')

    # web-interface
    if not is_json:
        dataset = json.load(raw_data)
        for criterion, predictors in dataset.items():
            observation_label = criterion

            # list of observation label
            observation_labels.append(criterion)

            # criterion with single observation
            if type(predictors) == dict:
                for label, predictor in predictors.items():
                    # validation (part 1)
                    validate_predictor = Validator(predictor)
                    validate_predictor.validate_value()

                    if validate_predictor.get_errors():
                        logger.log(validate_predictor.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label':
                            str(observation_label),
                            'indep_variable_label':
                            str(label),
                            'indep_variable_value':
                            predictor
                        })

                # generalized feature count in an observation
                if not feature_count:
                    feature_count = len(predictors)

            # criterion with multiple observation
            if type(predictors) == list:
                for criterion in predictors:
                    for label, predictor in criterion.items():
                        # validation (part 1)
                        validate_predictor = Validator(predictor)
                        validate_predictor.validate_value()

                        if validate_predictor.get_errors():
                            logger.log(validate_predictor.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label':
                                str(observation_label),
                                'indep_variable_label':
                                str(label),
                                'indep_variable_value':
                                predictor
                            })

                        # generalized feature count in an observation
                        if not feature_count:
                            feature_count = len(criterion.items())

    # programmatic-interface
    else:
        dataset = raw_data

        for criterion, predictors in dataset.items():
            # list of observation label
            observation_labels.append(criterion)

            # criterion with single observation
            if type(predictors) == dict:
                for label, predictor in predictors.items():
                    # validation (part 1)
                    validate_predictor = Validator(predictor)
                    validate_predictor.validate_value()

                    if validate_predictor.get_errors():
                        logger.log(validate_predictor.get_errors())
                    else:
                        # restructured data
                        list_dataset.append({
                            'dep_variable_label':
                            str(criterion),
                            'indep_variable_label':
                            str(label),
                            'indep_variable_value':
                            predictor
                        })

            # generalized feature count in an observation
            if not feature_count:
                feature_count = len(predictors.items())

            # criterion with multiple observation
            if type(predictors) == list:
                for single_predictors in predictors:
                    for label, predictor in single_predictors.items():
                        # validation (part 1)
                        validate_predictor = Validator(predictor)
                        validate_predictor.validate_value()

                        if validate_predictor.get_errors():
                            logger.log(validate_predictor.get_errors())
                        else:
                            # restructured data
                            list_dataset.append({
                                'dep_variable_label':
                                str(criterion),
                                'indep_variable_label':
                                str(label),
                                'indep_variable_value':
                                predictor
                            })

                    # generalized feature count in an observation
                    if not feature_count:
                        feature_count = len(single_predictors.items())

    # close file
    if not is_json:
        raw_data.close()

    # save observation labels, and return
    return {
        'dataset': list_dataset,
        'observation_labels': observation_labels,
        'feature_count': feature_count
    }