Exemplo n.º 1
0
    def __get_unmatched(self, item: dict):
        """Collect the entries of ``item`` that still carry a usable value.

        An entry is kept when ``Utils.empty`` does not flag its value and
        ``Utils.clean_if_nan`` does not turn that value into ``None``. The
        cleaned value is the one that gets stored.

        :param item: mapping of leftover keys to their raw values.
        :return: a new dict holding only the kept entries.
        """
        leftovers = {}
        for name, raw in item.items():
            # Values reported as empty are dropped before any cleaning happens
            if Utils.empty(raw):
                continue

            cleaned = Utils.clean_if_nan(raw)
            if cleaned is None:
                continue

            leftovers[name] = cleaned

        return leftovers
Exemplo n.º 2
0
    def __get_unmatched(self, data, labels):
        """Build a dict of the row cells that were not consumed by the mapping.

        Cells that are ``None`` are skipped. Each remaining cell is keyed by
        its label when ``self.__has(labels, position)`` is true, otherwise by
        its position rendered as a string. A cell is stored only if
        ``Utils.clean_if_nan`` does not return ``None`` for it and
        ``Utils.empty`` does not flag the cleaned value.

        :param data: iterable of cell values for one row.
        :param labels: container of column labels, indexed by position.
        :return: dict of label (or stringified position) to cleaned value.
        """
        leftovers = {}
        for position, cell in enumerate(data):
            if cell is None:
                continue

            name = labels[position] if self.__has(labels, position) else str(position)

            cleaned = Utils.clean_if_nan(cell)
            if cleaned is None or Utils.empty(cleaned):
                continue

            leftovers[name] = cleaned

        return leftovers
Exemplo n.º 3
0
    def __get_unmatched(self, xmlitem, inputunmatched=None):
        """Recursively gather the remaining children of ``xmlitem`` into a dict.

        Children for which ``self.__has_children`` is true become nested dicts
        keyed by their tag (filled by a recursive call). Other children
        contribute their text, provided ``Utils.clean_if_nan`` does not return
        ``None`` and ``Utils.empty`` does not flag the cleaned text. Children
        sharing a tag overwrite each other, the last one wins.

        :param xmlitem: element whose direct children are inspected.
        :param inputunmatched: optional dict to fill in place; a new dict is
            created when it is ``None``.
        :return: the dict that was filled.
        """
        collected = inputunmatched if inputunmatched is not None else {}

        # Iterate over a snapshot of the children, as the original code does
        for child in list(xmlitem):
            if not self.__has_children(child):
                text = Utils.clean_if_nan(child.text)
                if text is None or Utils.empty(text):
                    continue
                collected[child.tag] = text
                continue

            nested = {}
            collected[child.tag] = nested
            self.__get_unmatched(child, nested)

        return collected
Exemplo n.º 4
0
    def get(self, key, subject, multiple=None, value_index=None, default=None):
        """Resolve a dotted ``key`` path inside a nested JSON-like ``subject``.

        Each dot-separated segment is looked up in the value reached so far.
        When a segment is missing, or the value reached so far cannot be
        indexed by a string key (``None``, a number, a string, a list), the
        lookup stops and the fallback is returned instead of raising.

        :param key: dotted path, e.g. ``'a.b.c'``.
        :param subject: the nested structure to read from.
        :param multiple: accepted for signature compatibility; not used here.
        :param value_index: when the final value is a non-empty list, selects
            one element: an ``int`` position, ``'first'`` or ``'last'``.
        :param default: value returned when the path cannot be resolved.
        :return: the resolved value passed through ``Utils.clean_if_nan``;
            ``default`` when the path cannot be resolved, or ``False`` if no
            default was given.
        """
        fallback = False if default is None else default

        parts = key.split('.')
        last = len(parts) - 1
        for index, part in enumerate(parts):
            try:
                if part not in subject:
                    return fallback
                subject = subject[part]
            except (TypeError, KeyError):
                # The value reached so far is not a mapping (None, scalar, str,
                # list), so the remaining path cannot be followed.
                return fallback

            if index != last:
                continue

            if type(subject) is list and len(subject) > 0:
                # Exact type check so that booleans are not treated as positions
                if type(value_index) is int:
                    try:
                        subject = subject[value_index]
                    except IndexError:
                        return fallback
                elif value_index == 'first':
                    subject = subject[0]
                elif value_index == 'last':
                    subject = subject[-1]

        # Set to None if value is NaN
        subject = Utils.clean_if_nan(subject)

        return subject
Exemplo n.º 5
0
    def process(self):
        """Read the CSV file and convert every row into a mapped dict.

        The file at ``self.filepath`` is loaded with pandas (no header row is
        inferred). When ``self.skip_header`` is set, the first row is kept as
        the column labels and is not converted. For every other row, each
        entry of ``self.mapping`` is resolved through one of three options:

        * ``col``: a column position, or ``'_all_'`` for the whole row, with
          optional ``default``, ``transformations`` and ``conditions``;
        * ``value``: a literal, or a string expression evaluated with
          ``$subject`` replaced by ``item``;
        * ``conditions``: the result of ``handle_conditions`` alone.

        When ``self.__save_unmatched`` is set, the columns read by position
        are blanked in the row and the rest is stored under
        ``self.__unmatched_key``.

        :return: list with one dict per converted row.
        :raises Exception: when a mapping entry has none of the supported
            options and error tolerance is off.
        """
        self.__csv = pd.read_csv(self.filepath, delimiter=self.delimiter, skip_blank_lines=self.skip_blank_lines, header=None)
        self.__csv = self.__csv.values

        labels = None
        result = []

        for it, data in enumerate(self.__csv):
            # Updating stats
            self.__stats['total_count_with_header'] += 1

            # The first row only provides the labels when a header is expected
            if self.skip_header and it == 0:
                labels = data

                # Updating stats
                self.__stats['header_skipped'] = True

                continue

            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            cols_to_delete = []
            for map_key, map_value in self.mapping.items():

                if 'col' in map_value:
                    col = map_value['col']

                    default = None
                    if 'default' in map_value:
                        default = map_value['default']

                    # '_all_' has to be recognised before the integer
                    # conversion, because int('_all_') raises ValueError.
                    if col == '_all_':
                        finalvalue = data
                    else:
                        col = int(col)
                        finalvalue = data[col]

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    if finalvalue is None:
                        if default is None:
                            finalvalue = ''
                        else:
                            finalvalue = default

                    if 'transformations' in map_value:
                        finalvalue = handle_transformations(map_value['transformations'], finalvalue, error_tolerance=self.__error_tolerance)

                    item = apply_value(item, map_key, finalvalue)

                    if 'conditions' in map_value:
                        finalvalue = handle_conditions(map_value['conditions'], item, data)
                        item = apply_value(item, map_key, finalvalue)

                    # To remember which cols have already been retrieved.
                    # '_all_' is not a position, so it cannot be blanked later.
                    if self.__save_unmatched and col != '_all_':
                        cols_to_delete.append(col)
                elif 'value' in map_value:
                    finalvalue = map_value['value']
                    if type(finalvalue) == str:
                        # NOTE(review): the expression is evaluated in this
                        # method's scope, so it can reach `item` and every
                        # other local. Only use trusted mapping files.
                        # If `parser` is the stdlib module, it was removed in
                        # Python 3.10 - confirm the supported versions.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, map_key, finalvalue)

                    if 'conditions' in map_value:
                        finalvalue = handle_conditions(map_value['conditions'], item, data)
                        item = apply_value(item, map_key, finalvalue)
                elif 'conditions' in map_value:
                    finalvalue = handle_conditions(map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)
                else:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(map_key)
                    if self.__error_tolerance:
                        Utils.log('error', text)
                        continue
                    else:
                        raise Exception(text)

            # Unmatched
            if self.__save_unmatched:
                # Blank the consumed columns so only the leftovers are reported
                for col in cols_to_delete:
                    data[col] = None

                item[self.__unmatched_key] = self.__get_unmatched(data, labels)

            result.append(item)

        return result
Exemplo n.º 6
0
    def process(self):
        """Parse the XML file at ``self.filepath`` and map each item node to a dict.

        Every node returned by ``self.xml.findall(self.item_node)`` is
        converted by walking ``self.mapping``. A mapping entry is resolved
        through ``col`` (a path read with ``self.get``, or ``'_all_'`` for the
        node itself), ``value`` (a literal or an evaluated expression) or
        ``conditions`` alone. When ``self.__save_unmatched`` is set, what is
        left of the node is stored under ``self.__unmatched_key``.

        :return: list with one dict per item node.
        :raises Exception: when a mapping entry has none of the supported
            options and ``self.error_tolerance`` is falsy.
        """
        from xml.etree import ElementTree as ET

        self.xml = ET.parse(self.filepath)

        result = []
        for xmlitem in self.xml.findall(self.item_node):
            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            for yaml_key, yaml_value in self.mapping.items():
                if 'col' in yaml_value:
                    col = yaml_value['col']

                    # Optional settings of a 'col' entry and their fallbacks
                    multiple = None
                    value_index = None
                    default = None
                    raw = True

                    if 'multiple' in yaml_value:
                        multiple = yaml_value['multiple']

                    if 'index' in yaml_value:
                        value_index = yaml_value['index']

                    if 'default' in yaml_value:
                        default = yaml_value['default']

                    # 'raw' is only honoured when it is a real boolean
                    if 'raw' in yaml_value:
                        raw_ = yaml_value['raw']
                        if type(raw_) is bool:
                            raw = raw_

                    # '_all_' hands over the whole node instead of one path
                    if col == '_all_':
                        finalvalue = xmlitem
                    else:
                        finalvalue = self.get(col, xmlitem, multiple,
                                              value_index, default, raw)

                    if 'transformations' in yaml_value:
                        finalvalue = handle_transformations(
                            yaml_value['transformations'],
                            finalvalue,
                            error_tolerance=self.error_tolerance)
                    item = apply_value(item, yaml_key, finalvalue)

                    # Conditions are computed after the value has been applied
                    # and their result is applied under the same key
                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)

                    # Deleting the value from original input object
                    if self.__save_unmatched:
                        self.__delete(col, xmlitem)
                elif 'value' in yaml_value:
                    finalvalue = yaml_value['value']
                    if type(finalvalue) == str:
                        # NOTE(review): the expression is evaluated in this
                        # method's scope, with '$subject' rewritten to the
                        # local name `item`. Only use trusted mapping files.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, yaml_key, finalvalue)

                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)
                elif 'conditions' in yaml_value:
                    finalvalue = handle_conditions(yaml_value['conditions'],
                                                   item)
                    item = apply_value(item, yaml_key, finalvalue)
                else:
                    # Unsupported entry: log and skip it, or abort
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        yaml_key)
                    if self.error_tolerance:
                        Utils.log('error', text)
                        continue
                    else:
                        raise Exception(text)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(xmlitem)

            result.append(item)

        return result
Exemplo n.º 7
0
    def get(self,
            key,
            subject,
            multiple=None,
            value_index=None,
            default=None,
            raw=True):
        """Resolve a dotted ``key`` path below the XML element ``subject``.

        Segments equal to ``'$subject'`` are skipped. Every segment but the
        last is followed with ``find`` (first match); the last one is read
        with ``findall`` so that all matches are available.

        :param key: dotted path of tag names, e.g. ``'$subject.a.b'``.
        :param subject: element to start from.
        :param multiple: when truthy, a list of matches is produced instead of
            a single one.
        :param value_index: ``int`` position of the match to pick; a falsy
            value means the first match (or all matches when ``multiple``).
        :param default: value used when nothing can be resolved.
        :param raw: when true the ``text`` of a match is used, otherwise the
            element itself.
        :return: the resolved value passed through ``Utils.clean_if_nan``;
            when a node on the path is missing, ``default`` or ``False`` if
            no default was given.
        """
        fallback = False if default is None else default

        def unwrap(node):
            # Either the text of the node or the node itself, depending on raw
            return node.text if raw else node

        parts = key.split('.')
        last = len(parts) - 1
        for index, part in enumerate(parts):
            if part == '$subject':
                continue

            if index != last:
                child = subject.find(part)
                if child is None:
                    return fallback
                subject = child
                continue

            # findall always yields a list, so only the list case is handled
            matches = subject.findall(part)
            if not matches:
                return fallback

            if not value_index:
                if multiple:
                    subject = [unwrap(node) for node in matches]
                else:
                    subject = unwrap(matches[0])
            elif type(value_index) is int:
                try:
                    picked = unwrap(matches[value_index])
                except IndexError:
                    # Position out of range: empty list or the fallback
                    subject = [] if multiple else fallback
                else:
                    subject = [picked] if multiple else picked
            # NOTE(review): any other truthy index (e.g. a string) leaves
            # `subject` pointing at the parent element - confirm whether
            # 'first'/'last' should be supported as in the JSON reader.

        # Set to None if value is NaN
        subject = Utils.clean_if_nan(subject)

        if subject is None and default is not None:
            subject = default

        return subject
Exemplo n.º 8
0
    def lazy_process(self):
        """Stream the XML file and hand mapped items to ``self.callback`` in batches.

        The file at ``self.filepath`` is read with ``lxml.etree.iterparse``.
        Each element whose tag equals ``self.item_node`` is converted by
        walking ``self.mapping`` (``col``, ``value`` or ``conditions``
        entries). Whenever the number of buffered items is a multiple of
        ``self.bulksize`` the buffer is passed to ``self.callback`` and then
        emptied; the remainder is flushed after the loop. Nothing is returned.

        :raises Exception: when a mapping entry has none of the supported
            options and ``self.error_tolerance`` is falsy.
        """
        import lxml.etree as ET

        self.xml = ET.iterparse(self.filepath)

        result = []
        for ev, elem in iter(self.xml):

            if elem.tag == self.item_node:
                # Updating stats
                self.__stats['total_count'] += 1

                item = {}
                for map_key, map_value in self.mapping.items():
                    if 'col' in map_value:
                        col = map_value['col']

                        # Optional settings of a 'col' entry and their fallbacks
                        multiple = None
                        value_index = None
                        default = None
                        raw = True

                        if 'multiple' in map_value:
                            multiple = map_value['multiple']

                        if 'index' in map_value:
                            value_index = map_value['index']

                        if 'default' in map_value:
                            default = map_value['default']

                        # 'raw' is only honoured when it is a real boolean
                        if 'raw' in map_value:
                            raw_ = map_value['raw']
                            if type(raw_) is bool:
                                raw = raw_

                        finalvalue = self.get(col, elem, multiple, value_index,
                                              default, raw)

                        if 'transformations' in map_value:
                            finalvalue = handle_transformations(
                                map_value['transformations'],
                                finalvalue,
                                error_tolerance=self.error_tolerance)

                        item = apply_value(item, map_key, finalvalue)

                        if 'conditions' in map_value:
                            finalvalue = handle_conditions(
                                map_value['conditions'], item)
                            item = apply_value(item, map_key, finalvalue)

                        # Deleting the value from original input object
                        if self.__save_unmatched:
                            self.__delete(col, elem)
                    elif 'value' in map_value:
                        finalvalue = map_value['value']
                        if type(finalvalue) == str:
                            # NOTE(review): the expression is evaluated in this
                            # method's scope, with '$subject' rewritten to the
                            # local name `item`. Only use trusted mapping files.
                            finalvalue = finalvalue.replace('$subject', 'item')
                            expr = parser.expr(finalvalue)
                            finalvalue = eval(expr.compile(''))

                        # Set to None if value is NaN
                        finalvalue = Utils.clean_if_nan(finalvalue)

                        item = apply_value(item, map_key, finalvalue)

                        if 'conditions' in map_value:
                            finalvalue = handle_conditions(
                                map_value['conditions'], item)
                            item = apply_value(item, map_key, finalvalue)
                    elif 'conditions' in map_value:
                        finalvalue = handle_conditions(map_value['conditions'],
                                                       item)
                        item = apply_value(item, map_key, finalvalue)
                    else:
                        # Unsupported entry: log and skip it, or abort
                        text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                            map_key)
                        if self.error_tolerance:
                            Utils.log('error', text)
                            continue
                        else:
                            raise Exception(text)

                # Unmatched
                if self.__save_unmatched:
                    item[self.__unmatched_key] = self.__get_unmatched(elem)

                result.append(item)
                # Flush a full batch. The same list object is emptied right
                # after the callback returns, so the callback has to consume
                # it before returning.
                if (len(result) % self.bulksize) == 0:
                    self.callback(result)
                    result.clear()
                    gc.collect()

                # Clearing the element now that the values have been extracted
                elem.clear()
                # Drop the already-processed preceding siblings of the element
                # and of each of its ancestors from the tree
                for ancestor in elem.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]

        # Flush whatever is left after the last full batch
        if len(result) > 0:
            self.callback(result)
            result.clear()
            gc.collect()
Exemplo n.º 9
0
    def process(self):
        """Load the JSON file and convert every item into a mapped dict.

        The file at ``self.__filepath`` is parsed in one go. Items are taken
        from the document itself, or from ``content[self.__root_node]`` when
        the root node is a string. Each entry of ``self.__mapping`` is
        resolved through ``col`` (a path read with ``self.get``, or
        ``'_all_'`` for the whole item), ``value`` (a literal or an evaluated
        expression) or ``conditions`` alone. When ``self.__save_unmatched`` is
        set, what is left of the item is stored under
        ``self.__unmatched_key``.

        :return: list with one dict per item.
        :raises Exception: when a mapping entry has none of the supported
            options and error tolerance is off (same rule as
            ``lazy_process``).
        """
        import json

        with open(self.__filepath, 'r') as fh:
            jsoncontent = fh.read()
            self.json = json.loads(jsoncontent)

        result = []
        content = self.json

        if type(self.__root_node) is not str:
            iterator = iter(content)
        else:
            iterator = iter(content[self.__root_node])

        for jsonitem in iterator:
            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            for yaml_key, yaml_value in self.__mapping.items():
                if 'col' in yaml_value:
                    col = yaml_value['col']

                    multiple = None
                    value_index = None
                    default = None

                    if 'multiple' in yaml_value:
                        multiple = yaml_value['multiple']

                    # Only integer positions are forwarded as index
                    if 'index' in yaml_value:
                        val_index = yaml_value['index']
                        if type(val_index) is int:
                            value_index = val_index

                    if 'default' in yaml_value:
                        default = yaml_value['default']

                    if col == '_all_':
                        finalvalue = jsonitem
                    else:
                        finalvalue = self.get(col, jsonitem, multiple,
                                              value_index, default)

                    if 'transformations' in yaml_value:
                        finalvalue = handle_transformations(
                            yaml_value['transformations'],
                            finalvalue,
                            error_tolerance=self.__error_tolerance)

                    item = apply_value(item, yaml_key, finalvalue)

                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item, jsonitem)
                        item = apply_value(item, yaml_key, finalvalue)

                    # Deleting the value from original input object
                    if self.__save_unmatched:
                        self.__delete(col, jsonitem)
                elif 'value' in yaml_value:
                    finalvalue = yaml_value['value']
                    if type(finalvalue) == str:
                        # NOTE(review): the expression is evaluated in this
                        # method's scope, with '$subject' rewritten to the
                        # local name `item`. Only use trusted mapping files.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Clean if NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, yaml_key, finalvalue)

                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)
                elif 'conditions' in yaml_value:
                    finalvalue = handle_conditions(yaml_value['conditions'],
                                                   item)

                    # Set to None if value is NaN. The cleaned value has to be
                    # kept, otherwise the call has no effect.
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, yaml_key, finalvalue)
                else:
                    # Unsupported entry: log and skip it, or abort
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        yaml_key)
                    if self.__error_tolerance:
                        Utils.log('error', text)
                        continue
                    else:
                        raise Exception(text)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(jsonitem)

            result.append(item)

        return result
Exemplo n.º 10
0
    def lazy_process(self):
        """Stream the JSON file and hand mapped items to ``self.__callback`` in batches.

        Items are produced by ``ijson.items`` using the prefix ``'item'``, or
        ``'<root_node>.item'`` when ``self.__root_node`` is a string. Each
        object is converted by walking ``self.__mapping`` (``col``, ``value``
        or ``conditions`` entries). Whenever the number of buffered items is a
        multiple of ``self.__bulksize`` the buffer is passed to
        ``self.__callback`` and a new buffer is started; the remainder is
        flushed after the loop. Nothing is returned.

        :raises Exception: when a mapping entry has none of the supported
            options and error tolerance is off.
        """
        import ijson

        # ijson prefix selecting the elements of the array to iterate over
        root_node = 'item'
        if self.__root_node is not None and type(self.__root_node) is str:
            root_node = '{}.item'.format(self.__root_node)

        with open(self.__filepath, 'rb') as fh:
            self.__content = ijson.items(fh, root_node)
            results = []
            # Not read anywhere in this method
            it = 0

            # Iterating over JSON generator
            for jsonobject in self.__content:
                # Updating stats
                self.__stats['total_count'] += 1

                item = {}

                # For each JSON Object, iterating over the YAML mapping and retrieving data
                for yaml_key, yaml_value in self.__mapping.items():

                    if 'col' in yaml_value:
                        col = yaml_value['col']

                        multiple = None
                        value_index = None
                        default = None

                        if 'multiple' in yaml_value:
                            multiple = yaml_value['multiple']

                        # Only integer positions are forwarded as index
                        if 'index' in yaml_value:
                            val_index = yaml_value['index']
                            if type(val_index) is int:
                                value_index = val_index

                        if 'default' in yaml_value:
                            default = yaml_value['default']

                        # '_all_' hands over the whole object instead of one path
                        if col == '_all_':
                            finalvalue = jsonobject
                        else:
                            finalvalue = self.get(col, jsonobject, multiple,
                                                  value_index, default)

                        # If transformations are defined in the mapping, applying them
                        if 'transformations' in yaml_value:
                            finalvalue = handle_transformations(
                                yaml_value['transformations'],
                                finalvalue,
                                error_tolerance=self.__error_tolerance)

                        item = apply_value(item, yaml_key, finalvalue)

                        # Handling conditions
                        if 'conditions' in yaml_value:
                            finalvalue = handle_conditions(
                                yaml_value['conditions'], item, jsonobject)
                            item = apply_value(item, yaml_key, finalvalue)

                        # Deleting the value from original input object
                        if self.__save_unmatched:
                            self.__delete(col, jsonobject)

                    elif 'value' in yaml_value:
                        finalvalue = yaml_value['value']
                        if type(finalvalue) == str:
                            # NOTE(review): the expression is evaluated in this
                            # method's scope, with '$subject' rewritten to the
                            # local name `item`. Only use trusted mapping files.
                            finalvalue = finalvalue.replace('$subject', 'item')
                            expr = parser.expr(finalvalue)
                            finalvalue = eval(expr.compile(''))

                        # Set to None if value is NaN
                        finalvalue = Utils.clean_if_nan(finalvalue)

                        item = apply_value(item, yaml_key, finalvalue)

                        if 'conditions' in yaml_value:
                            finalvalue = handle_conditions(
                                yaml_value['conditions'], item)
                            item = apply_value(item, yaml_key, finalvalue)
                    elif 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)
                    else:
                        # Unsupported entry: log and skip it, or abort
                        text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                            yaml_key)
                        if self.__error_tolerance:
                            Utils.log('error', text)
                            continue
                        else:
                            raise Exception(text)

                # Unmatched
                if self.__save_unmatched:
                    item[self.__unmatched_key] = self.__get_unmatched(
                        jsonobject)

                results.append(item)

                # Flush a full batch; a new list is started so the one handed
                # to the callback is left untouched
                if len(results) % self.__bulksize == 0:
                    self.__callback(results)
                    results = []
                    gc.collect()

            # Flush whatever is left after the last full batch
            if len(results) > 0:
                self.__callback(results)
                results = []
                gc.collect()