Exemplo n.º 1
0
    def merge_files(self, files, output):
        """
        Merge every given CSV file into one CSV written at ``output``.

        Rows from different files are matched on the values found in the
        ``self.fixed_columns`` positions. When a row was already seen, each
        non-empty value of the newer row replaces the stored one.

        :param files: sequence of paths of the CSV files to process
        :param output: destination handed to ``easyio.write_file``
        """
        all_columns = self.fixed_columns[:] + self.merge_columns[:]
        merged = {}

        for position, csv_path in enumerate(files):
            # Progress report: share of the files already handled
            percent = int(position * 100 / len(files))
            print('{0}%'.format(percent))

            file_headers, file_rows = self.get_headers_and_data(csv_path)

            # Text matched in the path (if any) is passed along as a prefix
            found = self.new_columns_regex.search(csv_path)
            if found:
                prefix = found.group()
            else:
                prefix = u''
            lookup = self.build_column_indices(
                file_headers, all_columns, prefix)

            for raw_row in file_rows:
                record = self.build_data(raw_row, lookup, all_columns)

                # The leading fixed-column values identify the row
                key_cells = record[0:len(self.fixed_columns)]
                row_key = easyio.merge(key_cells, self.separator)

                if row_key not in merged:
                    merged[row_key] = record
                    continue

                # Same list object as merged[row_key]: updated in place
                stored = merged[row_key]
                shortfall = len(record) - len(stored)
                if shortfall > 0:
                    stored.extend([u''] * shortfall)
                for slot, value in enumerate(record):
                    if len(value) > 0:
                        stored[slot] = value

        # Header line first, then one line per merged row
        header_line = easyio.merge(all_columns[:], self.separator)
        body_lines = [
            easyio.merge(merged[row_key], self.separator)
            for row_key in merged
        ]

        easyio.write_file(output, '\n'.join([header_line] + body_lines))
Exemplo n.º 2
0
    def _collapse_headers(self, path, content, fixed_columns, optional_columns,
                          ignore_columns):
        """
        Internal function to collapse a multi-line header into one line.

        Leading lines of ``content`` are consumed one at a time until every
        fixed column has been matched or no lines are left. For each column
        position the text of the consumed lines is accumulated (joined with
        a space) until it matches a wanted or an ignored column, after
        which that position is locked and no longer changes.

        ``easyio.match`` is used as returning -1 for "no match" and an
        index into the given list otherwise.

        :param path: not used by this function
        :param content: full CSV text, lines separated by ``'\\n'``
        :param fixed_columns: column names that must all be found
        :param optional_columns: column names that may be found
        :param ignore_columns: names whose columns are locked as they are
        :return: the collapsed header line followed by the unconsumed lines
        """
        remaining = content.split('\n')

        # One accumulator per column of the first line, initially empty
        width = len(easyio.split(remaining[0], self.separator))
        collapsed = [''] * width
        locked = [False] * width

        wanted = fixed_columns[:] + optional_columns[:]
        # Optional columns count as satisfied from the start
        satisfied = [False] * len(fixed_columns) +\
            [True] * len(optional_columns)

        while remaining and not all(satisfied):
            candidate = easyio.split(remaining.pop(0), self.separator)

            for pos, cell in enumerate(candidate):
                if locked[pos]:
                    continue

                # The cell alone names a wanted column: take it as is
                hit = easyio.match(cell, wanted)
                if hit != -1:
                    collapsed[pos] = cell
                    satisfied[hit] = True
                    locked[pos] = True
                    continue

                # What was accumulated so far is an ignored column
                if easyio.match(collapsed[pos], ignore_columns) != -1:
                    locked[pos] = True
                    continue

                # Otherwise append the cell and test the combined text
                collapsed[pos] = (collapsed[pos] + ' ' + cell).strip()

                hit = easyio.match(collapsed[pos], wanted)
                if hit != -1:
                    satisfied[hit] = True
                    locked[pos] = True
                if easyio.match(collapsed[pos], ignore_columns) != -1:
                    locked[pos] = True

        remaining.insert(0, easyio.merge(collapsed, self.separator))
        return '\n'.join(remaining)
Exemplo n.º 3
0
    def _set_headers(self, path, content, headers_dict):
        """
        Internal function to overwrite header cells by position.

        :param path: not used by this function
        :param content: full CSV text, lines separated by ``'\\n'``
        :param headers_dict: mapping of column index to new header text;
            indices not below the number of header cells are skipped
        :return: the content with its first line rewritten
        """
        rows = content.split('\n')
        header_cells = easyio.split(rows[0], self.separator)
        width = len(header_cells)

        for position, title in headers_dict.items():
            if position < width:
                header_cells[position] = title

        rows[0] = easyio.merge(header_cells, self.separator)
        return '\n'.join(rows)
Exemplo n.º 4
0
    def _remove_content(self, path, content, unwanted_content):
        """
        Internal function to blank out unwanted cells.

        Every cell for which ``easyio.match`` against ``unwanted_content``
        does not return -1 is replaced by an empty string.

        :param path: not used by this function
        :param content: full CSV text, lines separated by ``'\\n'``
        :param unwanted_content: values handed to ``easyio.match``
        :return: the content with the matching cells emptied
        """
        def blank_if_unwanted(cell):
            if easyio.match(cell, unwanted_content) != -1:
                return ''
            return cell

        cleaned = []
        for line in content.split('\n'):
            cells = easyio.split(line, self.separator)
            kept = [blank_if_unwanted(cell) for cell in cells]
            cleaned.append(easyio.merge(kept, self.separator))

        return '\n'.join(cleaned)
Exemplo n.º 5
0
    def _expand_rows(self, path, content):
        """
        Internal function to give every row the same number of cells.

        Rows shorter than the widest row are padded on the right with
        empty strings.

        :param path: not used by this function
        :param content: full CSV text, lines separated by ``'\\n'``
        :return: the content with every line padded to the widest row
        """
        table = [
            easyio.split(line, self.separator)
            for line in content.split('\n')
        ]
        # split('\n') always yields at least one line, so max() is safe
        widest = max(len(cells) for cells in table)

        padded = [
            easyio.merge(cells + [''] * (widest - len(cells)), self.separator)
            for cells in table
        ]
        return '\n'.join(padded)
Exemplo n.º 6
0
    def _remove_empty_columns(self, path, content):
        """
        Internal function to drop columns that are blank in every row.

        A column is kept when at least one row holds a cell at that
        position that is not empty after stripping whitespace.

        >>> content='a,b,,,e\\n1,2,,,5'
        >>> e = Editor()
        >>> e._remove_empty_columns('', content)
        '"a","b","e"\\n"1","2","5"'
        """
        table = [
            easyio.split(line, self.separator)
            for line in content.split('\n')
        ]

        # Positions where some row has a non-blank cell
        keep = set()
        for cells in table:
            keep.update(
                idx for idx, cell in enumerate(cells) if cell.strip())

        trimmed = []
        for cells in table:
            survivors = [cell for idx, cell in enumerate(cells) if idx in keep]
            trimmed.append(easyio.merge(survivors, self.separator))

        return '\n'.join(trimmed)