예제 #1
0
파일: csv.py 프로젝트: polkovnik-z/fluxify
    def process(self):
        """Load the CSV file and map every row to an item using ``self.mapping``.

        The file at ``self.filepath`` is read with ``pandas.read_csv`` using
        ``header=None`` and iterated row by row. When ``self.skip_header`` is
        truthy, the first row is kept as ``labels`` and is not mapped.

        Each mapping entry is handled through the first option it contains:

        * ``col``: a column index (anything ``int()`` accepts) or the literal
          ``'_all_'`` to take the whole row. A value that comes back as
          ``None`` from ``Utils.clean_if_nan`` is replaced by the entry's
          ``default`` or, failing that, by ``''``. Optional
          ``transformations`` and ``conditions`` are applied afterwards.
        * ``value``: a literal; strings are evaluated as a Python expression
          after ``$subject`` has been replaced by ``item``.
        * ``conditions``: the result of ``handle_conditions`` is stored.

        Returns:
            list: one mapped item per processed row, in file order.

        Raises:
            Exception: when a mapping entry has none of the supported options
                and ``self.__error_tolerance`` is falsy.
        """
        self.__csv = pd.read_csv(self.filepath, delimiter=self.delimiter, skip_blank_lines=self.skip_blank_lines, header=None)
        self.__csv = self.__csv.values

        labels = None
        result = []

        for it, data in enumerate(self.__csv):
            # Updating stats
            self.__stats['total_count_with_header'] += 1

            # Skipping the first line if needed
            if self.skip_header and it == 0:
                labels = data

                # Updating stats
                self.__stats['header_skipped'] = True

                continue

            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            cols_to_delete = []
            for map_key, map_value in self.mapping.items():

                if 'col' in map_value:
                    col = map_value['col']

                    default = None
                    if 'default' in map_value:
                        default = map_value['default']

                    # '_all_' has to be recognised BEFORE the int() conversion:
                    # int('_all_') raises ValueError, which previously made the
                    # whole-row option unusable.
                    whole_row = col == '_all_'
                    if whole_row:
                        # Copy the row so that the columns blanked further down
                        # for the unmatched bookkeeping do not alter the value
                        # stored in the item.
                        finalvalue = data.copy()
                    else:
                        col = int(col)
                        finalvalue = data[col]

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    if finalvalue is None:
                        if default is None:
                            finalvalue = ''
                        else:
                            finalvalue = default

                    if 'transformations' in map_value:
                        finalvalue = handle_transformations(map_value['transformations'], finalvalue, error_tolerance=self.__error_tolerance)

                    item = apply_value(item, map_key, finalvalue)

                    if 'conditions' in map_value:
                        finalvalue = handle_conditions(map_value['conditions'], item, data)
                        item = apply_value(item, map_key, finalvalue)

                    # To remember which cols have already been retrieved.
                    # '_all_' is not a column index, so it is not recorded.
                    # NOTE(review): confirm whether a whole-row mapping should
                    # instead mark every column as matched.
                    if self.__save_unmatched and not whole_row:
                        cols_to_delete.append(col)
                elif 'value' in map_value:
                    finalvalue = map_value['value']
                    if type(finalvalue) == str:
                        # SECURITY NOTE(review): the mapping string is executed
                        # as a Python expression in this method's scope (so
                        # 'item' resolves to the item being built). Only safe
                        # when mappings come from a trusted source.
                        # NOTE(review): 'parser' looks like the stdlib module
                        # that was removed in Python 3.10 - confirm.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, map_key, finalvalue)

                    if 'conditions' in map_value:
                        finalvalue = handle_conditions(map_value['conditions'], item, data)
                        item = apply_value(item, map_key, finalvalue)
                elif 'conditions' in map_value:
                    finalvalue = handle_conditions(map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)
                else:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(map_key)
                    if self.__error_tolerance:
                        Utils.log('error', text)
                        continue
                    else:
                        raise Exception(text)

            # Unmatched: blank every column that was mapped, then let
            # __get_unmatched build the remainder from the row and the labels.
            if self.__save_unmatched:
                for col in cols_to_delete:
                    data[col] = None

                item[self.__unmatched_key] = self.__get_unmatched(data, labels)

            result.append(item)

        return result
예제 #2
0
    def lazy_process(self):
        """Stream the XML file and pass mapped items to ``self.callback`` in batches.

        The file at ``self.filepath`` is parsed incrementally with
        ``lxml.etree.iterparse``. Every element whose tag equals
        ``self.item_node`` is mapped through ``self.mapping`` and appended to
        a batch; the batch is handed to ``self.callback`` each time its length
        is a multiple of ``self.bulksize``, and once more at the end if
        anything is left. Nothing is returned.

        Raises:
            Exception: when a mapping entry has none of the supported options
                and ``self.error_tolerance`` is falsy.
        """
        import lxml.etree as ET

        self.xml = ET.iterparse(self.filepath)

        result = []
        for ev, elem in self.xml:
            # Anything that is not an item node is ignored
            if elem.tag != self.item_node:
                continue

            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            for map_key, map_value in self.mapping.items():
                from_col = 'col' in map_value

                if from_col:
                    col = map_value['col']

                    # Optional lookup settings; absent keys fall back to None
                    multiple = map_value['multiple'] if 'multiple' in map_value else None
                    value_index = map_value['index'] if 'index' in map_value else None
                    default = map_value['default'] if 'default' in map_value else None

                    # 'raw' is only honoured when it is a real bool
                    raw_ = map_value['raw'] if 'raw' in map_value else True
                    raw = raw_ if type(raw_) is bool else True

                    finalvalue = self.get(col, elem, multiple, value_index,
                                          default, raw)

                    if 'transformations' in map_value:
                        finalvalue = handle_transformations(
                            map_value['transformations'],
                            finalvalue,
                            error_tolerance=self.error_tolerance)

                    item = apply_value(item, map_key, finalvalue)
                elif 'value' in map_value:
                    finalvalue = map_value['value']
                    if type(finalvalue) == str:
                        # The string is evaluated as an expression in this
                        # method's scope, so '$subject' ends up referring to
                        # the local 'item'.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, map_key, finalvalue)
                elif 'conditions' not in map_value:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        map_key)
                    if not self.error_tolerance:
                        raise Exception(text)
                    Utils.log('error', text)
                    continue

                # Conditions run last for every kind of entry and overwrite
                # whatever was stored under this key
                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item)
                    item = apply_value(item, map_key, finalvalue)

                # Deleting the value from original input object
                if from_col and self.__save_unmatched:
                    self.__delete(col, elem)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(elem)

            result.append(item)
            if not len(result) % self.bulksize:
                self.callback(result)
                result.clear()
                gc.collect()

            # Clearing the element now that the values have been extracted,
            # and dropping the already-visited siblings of it and its ancestors
            elem.clear()
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]

        # Flush whatever did not fill a complete batch
        if result:
            self.callback(result)
            result.clear()
            gc.collect()
예제 #3
0
    def process(self):
        """Parse the whole XML file and map every item node to an item.

        The file at ``self.filepath`` is parsed with
        ``xml.etree.ElementTree``; every element returned by
        ``findall(self.item_node)`` is mapped through ``self.mapping``.

        Returns:
            list: one mapped item per matching element, in document order.

        Raises:
            Exception: when a mapping entry has none of the supported options
                and ``self.error_tolerance`` is falsy.
        """
        from xml.etree import ElementTree as ET

        self.xml = ET.parse(self.filepath)

        result = []
        for xmlitem in self.xml.findall(self.item_node):
            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            for yaml_key, yaml_value in self.mapping.items():
                from_col = 'col' in yaml_value

                if from_col:
                    col = yaml_value['col']

                    # Optional lookup settings; absent keys fall back to None
                    multiple = yaml_value['multiple'] if 'multiple' in yaml_value else None
                    value_index = yaml_value['index'] if 'index' in yaml_value else None
                    default = yaml_value['default'] if 'default' in yaml_value else None

                    # 'raw' is only honoured when it is a real bool
                    raw_ = yaml_value['raw'] if 'raw' in yaml_value else True
                    raw = raw_ if type(raw_) is bool else True

                    # '_all_' selects the element itself instead of a lookup
                    finalvalue = (xmlitem if col == '_all_' else
                                  self.get(col, xmlitem, multiple,
                                           value_index, default, raw))

                    if 'transformations' in yaml_value:
                        finalvalue = handle_transformations(
                            yaml_value['transformations'],
                            finalvalue,
                            error_tolerance=self.error_tolerance)

                    item = apply_value(item, yaml_key, finalvalue)
                elif 'value' in yaml_value:
                    finalvalue = yaml_value['value']
                    if type(finalvalue) == str:
                        # The string is evaluated as an expression in this
                        # method's scope, so '$subject' ends up referring to
                        # the local 'item'.
                        finalvalue = finalvalue.replace('$subject', 'item')
                        expr = parser.expr(finalvalue)
                        finalvalue = eval(expr.compile(''))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)

                    item = apply_value(item, yaml_key, finalvalue)
                elif 'conditions' not in yaml_value:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        yaml_key)
                    if not self.error_tolerance:
                        raise Exception(text)
                    Utils.log('error', text)
                    continue

                # Conditions run last for every kind of entry and overwrite
                # whatever was stored under this key
                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

                # Deleting the value from original input object
                if from_col and self.__save_unmatched:
                    self.__delete(col, xmlitem)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(xmlitem)

            result.append(item)

        return result
예제 #4
0
파일: json.py 프로젝트: polkovnik-z/fluxify
    def lazy_process(self):
        """Stream the JSON file and pass mapped items to the callback in batches.

        Objects are read lazily with ``ijson.items`` from the prefix
        ``'item'``, or ``'<root_node>.item'`` when ``self.__root_node`` is a
        string. Each object is mapped through ``self.__mapping`` and appended
        to a batch; the batch is handed to ``self.__callback`` each time its
        length is a multiple of ``self.__bulksize``, and once more at the end
        if anything is left. Nothing is returned.

        Raises:
            Exception: when a mapping entry has none of the supported options
                and ``self.__error_tolerance`` is falsy.
        """
        import ijson

        # A non-string (including None) root node means the default prefix
        if type(self.__root_node) is str:
            root_node = '{}.item'.format(self.__root_node)
        else:
            root_node = 'item'

        with open(self.__filepath, 'rb') as fh:
            self.__content = ijson.items(fh, root_node)
            results = []
            it = 0  # not read anywhere below

            # Iterating over JSON generator
            for jsonobject in self.__content:
                # Updating stats
                self.__stats['total_count'] += 1

                item = {}

                # For each JSON Object, iterating over the YAML mapping and retrieving data
                for yaml_key, yaml_value in self.__mapping.items():
                    from_col = 'col' in yaml_value

                    if from_col:
                        col = yaml_value['col']

                        # Optional lookup settings; absent keys fall back to None
                        multiple = yaml_value['multiple'] if 'multiple' in yaml_value else None
                        default = yaml_value['default'] if 'default' in yaml_value else None

                        # 'index' is only honoured when it is a real int
                        val_index = yaml_value['index'] if 'index' in yaml_value else None
                        value_index = val_index if type(val_index) is int else None

                        # '_all_' selects the object itself instead of a lookup
                        finalvalue = (jsonobject if col == '_all_' else
                                      self.get(col, jsonobject, multiple,
                                               value_index, default))

                        # If transformations are defined in the mapping, applying them
                        if 'transformations' in yaml_value:
                            finalvalue = handle_transformations(
                                yaml_value['transformations'],
                                finalvalue,
                                error_tolerance=self.__error_tolerance)

                        item = apply_value(item, yaml_key, finalvalue)
                    elif 'value' in yaml_value:
                        finalvalue = yaml_value['value']
                        if type(finalvalue) == str:
                            # The string is evaluated as an expression in this
                            # method's scope, so '$subject' ends up referring
                            # to the local 'item'.
                            finalvalue = finalvalue.replace('$subject', 'item')
                            expr = parser.expr(finalvalue)
                            finalvalue = eval(expr.compile(''))

                        # Set to None if value is NaN
                        finalvalue = Utils.clean_if_nan(finalvalue)

                        item = apply_value(item, yaml_key, finalvalue)
                    elif 'conditions' not in yaml_value:
                        text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                            yaml_key)
                        if not self.__error_tolerance:
                            raise Exception(text)
                        Utils.log('error', text)
                        continue

                    # Handling conditions: they run last for every kind of
                    # entry. Only 'col' entries pass the source object along.
                    if 'conditions' in yaml_value:
                        if from_col:
                            finalvalue = handle_conditions(
                                yaml_value['conditions'], item, jsonobject)
                        else:
                            finalvalue = handle_conditions(
                                yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)

                    # Deleting the value from original input object
                    if from_col and self.__save_unmatched:
                        self.__delete(col, jsonobject)

                # Unmatched
                if self.__save_unmatched:
                    item[self.__unmatched_key] = self.__get_unmatched(
                        jsonobject)

                results.append(item)

                if not len(results) % self.__bulksize:
                    self.__callback(results)
                    results = []
                    gc.collect()

            # Flush whatever did not fill a complete batch
            if results:
                self.__callback(results)
                results = []
                gc.collect()