def merge_files(self, files, output):
    """
    Merge all the given csv files into a single csv written to output

    Rows are keyed by their first len(self.fixed_columns) values, joined
    with self.separator. When a key shows up more than once, every
    non-empty value of the newer row overwrites the stored value at the
    same position; empty values leave the stored value untouched.
    A progress percentage is printed before each file is processed.
    """
    columns = self.fixed_columns[:] + self.merge_columns[:]
    merged = {}

    for position, path in enumerate(files):
        # Progress report
        print(str(int(position * 100 / len(files))) + '%')

        headers, rows = self.get_headers_and_data(path)

        # Text matched in the path (if any) is handed over as the prefix
        found = self.new_columns_regex.search(path)
        prepend = found.group() if found else u''
        indices = self.build_column_indices(headers, columns, prepend)

        for row in rows:
            values = self.build_data(row, indices, columns)

            # The leading fixed columns identify the row
            key = easyio.merge(
                values[0:len(self.fixed_columns)], self.separator)

            if key not in merged:
                merged[key] = values
                continue

            stored = merged[key]
            # Pad the stored row so every position of the new one exists
            shortfall = len(values) - len(stored)
            if shortfall > 0:
                stored.extend([u''] * shortfall)
            for column, value in enumerate(values):
                if len(value) > 0:
                    stored[column] = value

    # Header line first, then one line per merged row
    output_lines = [easyio.merge(columns[:], self.separator)]
    output_lines.extend(
        easyio.merge(values, self.separator) for values in merged.values())

    easyio.write_file(output, '\n'.join(output_lines))
def _collapse_headers(self, path, content, fixed_columns, optional_columns,
                      ignore_columns):
    """
    Internal function to collapse headers

    Consumes lines from the top of content, folding them column by column
    into one header line, until every entry of fixed_columns has been
    matched or no lines are left. The folded header line is then placed
    in front of the lines that were not consumed.

    path is not used by this method.
    content is the text to process, one record per line.
    fixed_columns are the headers that must all be matched before the
    folding stops.
    optional_columns are headers that lock a column when matched but are
    not required for the folding to stop.
    ignore_columns are headers whose column stops accumulating text once
    its folded header matches one of them.

    Returns the content with the consumed lines replaced by the single
    folded header line.

    Header lines may have more fields than the first line; the working
    lists grow to the widest line seen.
    """
    lines = content.split('\n')
    width = len(easyio.split(lines[0], self.separator))
    current_headers = [''] * width
    # A locked column is final and is skipped on later header lines
    column_lock = [False] * width
    columns = fixed_columns[:] + optional_columns[:]
    # Optional columns start out satisfied, so only fixed ones gate the loop
    header_ok = ([False] * len(fixed_columns) +
                 [True] * len(optional_columns))

    # Walk with an index rather than re-slicing the list on every pass
    consumed = 0
    while consumed < len(lines) and not all(header_ok):
        new_headers = easyio.split(lines[consumed], self.separator)
        consumed += 1

        # Ragged input: make room for columns the first line did not have
        missing = len(new_headers) - len(current_headers)
        if missing > 0:
            current_headers.extend([''] * missing)
            column_lock.extend([False] * missing)

        for i, header in enumerate(new_headers):
            if column_lock[i]:
                continue
            found = easyio.match(header, columns)
            if found != -1:
                # This line alone names a known column
                current_headers[i] = header
                header_ok[found] = True
                column_lock[i] = True
            elif easyio.match(current_headers[i], ignore_columns) != -1:
                column_lock[i] = True
            else:
                # Accumulate headers spread over several lines
                current_headers[i] = (
                    current_headers[i] + ' ' + header).strip()
                found = easyio.match(current_headers[i], columns)
                if found != -1:
                    header_ok[found] = True
                    column_lock[i] = True
                if easyio.match(current_headers[i], ignore_columns) != -1:
                    column_lock[i] = True

    merged = easyio.merge(current_headers, self.separator)
    return '\n'.join([merged] + lines[consumed:])
def _set_headers(self, path, content, headers_dict):
    """
    Internal function to set headers

    headers_dict maps a position in the first line of content to the
    header text to put there. Positions that are not below the number of
    fields in the first line are skipped. path is not used by this
    method.

    Returns the content with its first line rewritten.
    """
    rows = content.split('\n')
    header_fields = easyio.split(rows[0], self.separator)
    field_count = len(header_fields)

    for position, name in headers_dict.items():
        if position < field_count:
            header_fields[position] = name

    rows[0] = easyio.merge(header_fields, self.separator)
    return '\n'.join(rows)
def _remove_content(self, path, content, unwanted_content):
    """
    Internal function to remove content

    Every field that easyio.match finds in unwanted_content is replaced
    by an empty string; all other fields are kept as they are. path is
    not used by this method.

    Returns the content rebuilt line by line.
    """
    cleaned_lines = []
    for line in content.split('\n'):
        fields = [
            '' if easyio.match(field, unwanted_content) != -1 else field
            for field in easyio.split(line, self.separator)
        ]
        cleaned_lines.append(easyio.merge(fields, self.separator))
    return '\n'.join(cleaned_lines)
def _expand_rows(self, path, content):
    """
    Internal function to expand rows

    Pads every line with empty fields at the end so that all lines have
    as many fields as the widest one. path is not used by this method.

    Returns the content rebuilt line by line.
    """
    rows = [
        easyio.split(line, self.separator) for line in content.split('\n')
    ]
    # split('\n') always yields at least one line, so rows is never empty
    width = max(len(row) for row in rows)

    padded_lines = []
    for row in rows:
        filler = [''] * (width - len(row))
        padded_lines.append(easyio.merge(row + filler, self.separator))
    return '\n'.join(padded_lines)
def _remove_empty_columns(self, path, content):
    """
    Internal function to remove empty columns

    A column is kept when at least one line has a value at that position
    that is not empty once surrounding whitespace is stripped. path is
    not used by this method.

    >>> content='a,b,,,e\\n1,2,,,5'
    >>> e = Editor()
    >>> e._remove_empty_columns('', content)
    '"a","b","e"\\n"1","2","5"'
    """
    rows = [
        easyio.split(line, self.separator) for line in content.split('\n')
    ]

    # Positions holding something other than whitespace in any row
    keep = set()
    for row in rows:
        keep.update(
            position for position, value in enumerate(row)
            if len(value.strip()) > 0)

    kept_lines = [
        easyio.merge(
            [value for position, value in enumerate(row) if position in keep],
            self.separator)
        for row in rows
    ]
    return '\n'.join(kept_lines)