def process(self):
    """Read the CSV file and turn every row into a mapped item.

    The file at ``self.filepath`` is loaded with pandas (no header row is
    inferred) and each row is converted into a dict by walking
    ``self.mapping``. Every mapping entry must contain one of:

    * ``col``: a column index, or the sentinel ``'_all_'`` for the whole
      row. Optional ``default``, ``transformations`` and ``conditions``
      keys are honoured.
    * ``value``: a literal; strings are evaluated as Python expressions
      after ``$subject`` has been replaced by ``item``.
    * ``conditions``: the value comes from ``handle_conditions`` alone.

    When ``self.skip_header`` is set, the first row is kept as labels and
    is not mapped. The stats counters are updated while iterating.

    Returns:
        list: one dict per processed row.

    Raises:
        Exception: if a mapping entry has none of the supported options
            and error tolerance is disabled.
    """
    frame = pd.read_csv(
        self.filepath,
        delimiter=self.delimiter,
        skip_blank_lines=self.skip_blank_lines,
        header=None,
    )
    self.__csv = frame.values

    labels = None
    result = []
    for it, data in enumerate(self.__csv):
        self.__stats['total_count_with_header'] += 1

        # The first row only provides the labels when the header is skipped.
        if self.skip_header and it == 0:
            labels = data
            self.__stats['header_skipped'] = True
            continue

        self.__stats['total_count'] += 1

        item = {}
        cols_to_delete = []
        for map_key, map_value in self.mapping.items():
            if 'col' in map_value:
                col = map_value['col']
                default = map_value.get('default')

                # The sentinel has to be tested before the int conversion,
                # otherwise int('_all_') raises and this branch is dead.
                if col == '_all_':
                    finalvalue = data
                else:
                    col = int(col)
                    finalvalue = data[col]

                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                if finalvalue is None:
                    finalvalue = '' if default is None else default

                if 'transformations' in map_value:
                    finalvalue = handle_transformations(
                        map_value['transformations'],
                        finalvalue,
                        error_tolerance=self.__error_tolerance)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)

                # Remember which columns were consumed. '_all_' is not a
                # real column index, so it cannot be blanked out later.
                if self.__save_unmatched and col != '_all_':
                    cols_to_delete.append(col)

            elif 'value' in map_value:
                finalvalue = map_value['value']
                if isinstance(finalvalue, str):
                    source = finalvalue.replace('$subject', 'item')
                    # NOTE(review): eval() runs arbitrary code taken from
                    # the mapping; only use mappings from trusted sources.
                    # Builtin compile() replaces the stdlib `parser`
                    # module, which was removed in Python 3.10.
                    finalvalue = eval(compile(source, '', 'eval'))

                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)

            elif 'conditions' in map_value:
                finalvalue = handle_conditions(
                    map_value['conditions'], item, data)
                item = apply_value(item, map_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(map_key)
                if self.__error_tolerance:
                    Utils.log('error', text)
                    continue
                raise Exception(text)

        # Unmatched: blank the consumed columns, keep whatever is left.
        if self.__save_unmatched:
            for col in cols_to_delete:
                data[col] = None
            item[self.__unmatched_key] = self.__get_unmatched(data, labels)

        result.append(item)

    return result
def lazy_process(self):
    """Stream the XML file and hand mapped items to the callback in batches.

    The file at ``self.filepath`` is parsed incrementally with
    ``lxml.etree.iterparse``. Every element whose tag equals
    ``self.item_node`` is converted into a dict by walking
    ``self.mapping`` (supported entry options: ``col``, ``value``,
    ``conditions``). Items are accumulated and passed to
    ``self.callback`` each time the batch length is a multiple of
    ``self.bulksize``; a final partial batch is flushed at the end.

    Nothing is returned: results are delivered through the callback only.

    Raises:
        Exception: if a mapping entry has none of the supported options
            and error tolerance is disabled.
    """
    import lxml.etree as ET

    self.xml = ET.iterparse(self.filepath)
    result = []
    for _event, elem in self.xml:
        if elem.tag != self.item_node:
            continue

        self.__stats['total_count'] += 1

        item = {}
        for map_key, map_value in self.mapping.items():
            if 'col' in map_value:
                col = map_value['col']
                multiple = map_value.get('multiple')
                value_index = map_value.get('index')
                default = map_value.get('default')

                # 'raw' is only honoured when it is a real boolean.
                raw = True
                raw_option = map_value.get('raw')
                if isinstance(raw_option, bool):
                    raw = raw_option

                finalvalue = self.get(
                    col, elem, multiple, value_index, default, raw)

                if 'transformations' in map_value:
                    finalvalue = handle_transformations(
                        map_value['transformations'],
                        finalvalue,
                        error_tolerance=self.error_tolerance)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item)
                    item = apply_value(item, map_key, finalvalue)

                # Deleting the value from original input object
                if self.__save_unmatched:
                    self.__delete(col, elem)

            elif 'value' in map_value:
                finalvalue = map_value['value']
                if isinstance(finalvalue, str):
                    source = finalvalue.replace('$subject', 'item')
                    # NOTE(review): eval() runs arbitrary code taken from
                    # the mapping; only use mappings from trusted sources.
                    # Builtin compile() replaces the stdlib `parser`
                    # module, which was removed in Python 3.10.
                    finalvalue = eval(compile(source, '', 'eval'))

                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item)
                    item = apply_value(item, map_key, finalvalue)

            elif 'conditions' in map_value:
                finalvalue = handle_conditions(map_value['conditions'], item)
                item = apply_value(item, map_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                    map_key)
                if self.error_tolerance:
                    Utils.log('error', text)
                    continue
                raise Exception(text)

        # Unmatched
        if self.__save_unmatched:
            item[self.__unmatched_key] = self.__get_unmatched(elem)

        result.append(item)

        if (len(result) % self.bulksize) == 0:
            self.callback(result)
            # Start a new list instead of clearing in place, so a callback
            # that keeps a reference to its batch does not see it emptied.
            result = []
            gc.collect()

        # Clearing the element now that the values have been extracted,
        # then dropping the already-processed preceding siblings of the
        # element and of each of its ancestors.
        elem.clear()
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]

    # Flush the last, possibly partial, batch.
    if result:
        self.callback(result)
        result = []
        gc.collect()
def process(self):
    """Parse the whole XML file and turn every item node into a mapped item.

    The file at ``self.filepath`` is loaded with
    ``xml.etree.ElementTree`` and every element returned by
    ``findall(self.item_node)`` is converted into a dict by walking
    ``self.mapping``. Every mapping entry must contain one of:

    * ``col``: a selector passed to ``self.get``, or the sentinel
      ``'_all_'`` for the element itself. Optional ``multiple``,
      ``index``, ``default``, ``raw``, ``transformations`` and
      ``conditions`` keys are honoured.
    * ``value``: a literal; strings are evaluated as Python expressions
      after ``$subject`` has been replaced by ``item``.
    * ``conditions``: the value comes from ``handle_conditions`` alone.

    Returns:
        list: one dict per matching element.

    Raises:
        Exception: if a mapping entry has none of the supported options
            and error tolerance is disabled.
    """
    from xml.etree import ElementTree as ET

    self.xml = ET.parse(self.filepath)
    result = []
    for xmlitem in self.xml.findall(self.item_node):
        self.__stats['total_count'] += 1

        item = {}
        for yaml_key, yaml_value in self.mapping.items():
            if 'col' in yaml_value:
                col = yaml_value['col']
                multiple = yaml_value.get('multiple')
                value_index = yaml_value.get('index')
                default = yaml_value.get('default')

                # 'raw' is only honoured when it is a real boolean.
                raw = True
                raw_option = yaml_value.get('raw')
                if isinstance(raw_option, bool):
                    raw = raw_option

                if col == '_all_':
                    finalvalue = xmlitem
                else:
                    finalvalue = self.get(
                        col, xmlitem, multiple, value_index, default, raw)

                if 'transformations' in yaml_value:
                    finalvalue = handle_transformations(
                        yaml_value['transformations'],
                        finalvalue,
                        error_tolerance=self.error_tolerance)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

                # Deleting the value from original input object
                if self.__save_unmatched:
                    self.__delete(col, xmlitem)

            elif 'value' in yaml_value:
                finalvalue = yaml_value['value']
                if isinstance(finalvalue, str):
                    source = finalvalue.replace('$subject', 'item')
                    # NOTE(review): eval() runs arbitrary code taken from
                    # the mapping; only use mappings from trusted sources.
                    # Builtin compile() replaces the stdlib `parser`
                    # module, which was removed in Python 3.10.
                    finalvalue = eval(compile(source, '', 'eval'))

                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

            elif 'conditions' in yaml_value:
                finalvalue = handle_conditions(yaml_value['conditions'], item)
                item = apply_value(item, yaml_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                    yaml_key)
                if self.error_tolerance:
                    Utils.log('error', text)
                    continue
                raise Exception(text)

        # Unmatched
        if self.__save_unmatched:
            item[self.__unmatched_key] = self.__get_unmatched(xmlitem)

        result.append(item)

    return result
def lazy_process(self):
    """Stream the JSON file and hand mapped items to the callback in batches.

    Objects are read lazily with ``ijson.items`` from the array found at
    ``'<root_node>.item'`` (or the top-level ``'item'`` prefix when no
    string root node is configured). Each object is converted into a dict
    by walking the mapping (supported entry options: ``col``, ``value``,
    ``conditions``). Items are accumulated and passed to the callback
    each time the batch length is a multiple of the bulk size; a final
    partial batch is flushed at the end.

    Nothing is returned: results are delivered through the callback only.

    Raises:
        Exception: if a mapping entry has none of the supported options
            and error tolerance is disabled.
    """
    import ijson

    root_node = 'item'
    if self.__root_node is not None and isinstance(self.__root_node, str):
        root_node = '{}.item'.format(self.__root_node)

    results = []
    # ijson.items() reads from the handle lazily, so the whole iteration
    # has to happen while the file is still open.
    with open(self.__filepath, 'rb') as fh:
        self.__content = ijson.items(fh, root_node)

        # Iterating over JSON generator
        for jsonobject in self.__content:
            self.__stats['total_count'] += 1

            item = {}
            # For each JSON object, iterating over the YAML mapping and
            # retrieving data
            for yaml_key, yaml_value in self.__mapping.items():
                if 'col' in yaml_value:
                    col = yaml_value['col']
                    multiple = yaml_value.get('multiple')
                    default = yaml_value.get('default')

                    # Exact type check on purpose: booleans are ints in
                    # Python but were never accepted as an index here.
                    value_index = None
                    val_index = yaml_value.get('index')
                    if type(val_index) is int:
                        value_index = val_index

                    if col == '_all_':
                        finalvalue = jsonobject
                    else:
                        finalvalue = self.get(
                            col, jsonobject, multiple, value_index, default)

                    # If transformations are defined in the mapping,
                    # applying them
                    if 'transformations' in yaml_value:
                        finalvalue = handle_transformations(
                            yaml_value['transformations'],
                            finalvalue,
                            error_tolerance=self.__error_tolerance)
                    item = apply_value(item, yaml_key, finalvalue)

                    # Handling conditions
                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item, jsonobject)
                        item = apply_value(item, yaml_key, finalvalue)

                    # Deleting the value from original input object
                    if self.__save_unmatched:
                        self.__delete(col, jsonobject)

                elif 'value' in yaml_value:
                    finalvalue = yaml_value['value']
                    if isinstance(finalvalue, str):
                        source = finalvalue.replace('$subject', 'item')
                        # NOTE(review): eval() runs arbitrary code taken
                        # from the mapping; only use mappings from trusted
                        # sources. Builtin compile() replaces the stdlib
                        # `parser` module, which was removed in Python 3.10.
                        finalvalue = eval(compile(source, '', 'eval'))

                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)
                    item = apply_value(item, yaml_key, finalvalue)

                    # NOTE(review): unlike the 'col' branch, the source
                    # object is not passed to handle_conditions here -
                    # confirm whether that is intentional.
                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)

                elif 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

                else:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        yaml_key)
                    if self.__error_tolerance:
                        Utils.log('error', text)
                        continue
                    raise Exception(text)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(
                    jsonobject)

            results.append(item)

            if len(results) % self.__bulksize == 0:
                self.__callback(results)
                results = []
                gc.collect()

    # Flush the last, possibly partial, batch.
    if results:
        self.__callback(results)
        results = []
        gc.collect()