Пример #1
0
    def sniff_for_dialect(self, delimited_file_path: str) -> csv.Dialect:
        """Sniff a delimited file and return the dialect detected for it.

        The path is normalized with ``join_n_norm``, the encoding is obtained
        from ``self.sniff_for_encoding`` and up to ``self.bytes_to_read``
        characters of the file are handed to ``Sniffer().sniff``.

        When ``self.poss_delimiters`` is truthy it restricts the candidate
        delimiters. Otherwise an unrestricted sniff is tried first and, if the
        result does not have a single-character delimiter and quotechar, a
        second sniff restricted to pipe, tab and comma is performed.

        Args:
            delimited_file_path: Path of the delimited text file to inspect.

        Returns:
            The dialect produced by ``Sniffer().sniff``.

        Raises:
            AssertionError: If ``verify_filepath`` returns a falsy value.
            csv.Error: If the dialect cannot be determined; the original error
                is chained as ``__cause__``.
        """
        delimited_filepath = join_n_norm(delimited_file_path)
        # Verify the normalized path: it is the one that is actually opened.
        assert verify_filepath(delimited_filepath)

        sniffed_encoding = self.sniff_for_encoding(delimited_filepath)

        # Only the sample is needed, so release the file handle before sniffing.
        with open(delimited_filepath, "rt", encoding=sniffed_encoding) as ftt:
            working_text = ftt.read(self.bytes_to_read)

        logger.debug("Sniffing for dialect; '{}'.".format(delimited_filepath))
        try:
            if self.poss_delimiters:
                sniffed = Sniffer().sniff(working_text, delimiters=self.poss_delimiters)
            else:
                sniffed = Sniffer().sniff(working_text)
                if len(sniffed.delimiter) != 1 or len(sniffed.quotechar) != 1:
                    # Retry with a conservative set of common delimiters.
                    poss_delimiters = '|\t,'
                    sniffed = Sniffer().sniff(working_text,
                                              delimiters=poss_delimiters)
        except csv.Error as csv_error:
            logger.error("There is something wrong with '{}'.".format(delimited_filepath))
            # Chain explicitly so the original traceback stays attached.
            raise csv.Error(csv_error) from csv_error

        return sniffed
Пример #2
0
    def sniff_for_encoding(self, delimited_file_path: str) -> str:
        """Detect and return the file's text encoding, falling back to latin-1.

        The file is read in binary mode in 4096-byte chunks, at most
        ``self.encoding_chunks_to_read`` of them, and each chunk is fed to a
        ``UniversalDetector``. Feeding stops early once the detector reports
        it is done or the end of the file is reached.

        Args:
            delimited_file_path: Path of the file to inspect.

        Returns:
            The encoding name reported by the detector, or ``"latin-1"`` when
            the detector did not settle on an encoding.
        """
        working_filepath = join_n_norm(delimited_file_path)
        verify_filepath(working_filepath)

        detector = UniversalDetector()

        with open(working_filepath, 'rb') as working_file:
            logger.debug("Sniffing for encoding; '{}'".format(delimited_file_path))
            for _ in range(self.encoding_chunks_to_read):
                if detector.done:
                    break
                chunk = working_file.read(4096)
                if not chunk:
                    # End of file: nothing more to feed.
                    break
                detector.feed(chunk)
        detector.close()

        # NOTE(review): assuming chardet's UniversalDetector here -- close()
        # can finalize the detector while leaving result['encoding'] as None
        # (e.g. empty or unrecognizable input), so checking `done` alone is
        # not enough to guarantee a usable encoding name.
        encoding = detector.result['encoding'] if detector.done else None
        if not encoding:
            encoding = "latin-1"
            logger.debug("Encoding not detected; set to latin-1.")
        else:
            logger.debug("Detected encoding; returning '{}'".format(encoding))

        return encoding
Пример #3
0
def integrity_check(infile_path: str) -> str:
    """Build a text report on the structural consistency of a delimited file.

    The report lists the detected field and text delimiters, every line that
    contains an odd number of text delimiters, and every record whose field
    count differs from that of the first row. It helps to spot problems in
    source files that may cause issues when creating schema files and to
    judge whether a file needs a preprocessor.

    Args:
        infile_path: Path of the delimited file to check.

    Returns:
        The report as a single newline-joined string.

    Raises:
        AssertionError: If ``verify_filepath`` returns a falsy value.
        csv.Error: If the dialect or the presence of a header row cannot be
            sniffed from the file.
    """
    working_filepath = join_n_norm(infile_path)
    assert verify_filepath(working_filepath)

    dd = DialectDetector()
    dialect = dd.sniff_for_dialect(working_filepath)
    encoding = dd.sniff_for_encoding(working_filepath)

    # has_header() analyses sample *text*, so read a sample from the file
    # instead of passing the path. The sample size matches the one
    # sniff_for_dialect uses on this same detector instance.
    with open(working_filepath, "rt", encoding=encoding) as infile:
        sample = infile.read(dd.bytes_to_read)
    has_headers = csv.Sniffer().has_header(sample)

    quotechar = dialect.quotechar
    quotes_per_line = []
    columns_per_line = []
    output = [
        "Filename: '{}'".format(split(working_filepath)[-1]),
        "Detected Field Delimiter: [{}]".format(dialect.delimiter),
        "Detected Text Delimiter: [{}]".format(quotechar),
        "Checking for even # of Text Delimiters on every line...",
    ]

    uneven_tdelimiter_list = []
    with open(working_filepath, "rt", encoding=encoding) as infile:
        for n, line in enumerate(infile, start=1):
            # A dialect without a quote character contributes zero quotes.
            quotes = line.count(quotechar) if quotechar else 0
            quotes_per_line.append(quotes)

            if quotes % 2 != 0:
                uneven_tdelimiter_list.append(
                    "Line {} = {} [{}]'s".format(n, quotes, quotechar))

    if uneven_tdelimiter_list:
        output.append("FAILED: lines with uneven number of text delimiters detected.")
        output.append("PROBLEM LINES:")
        output.append("\n".join(uneven_tdelimiter_list))
        output.append("TEXT DELIMITER DISPERSAL:")
        output.append("\n".join(
            "{} lines = {} text delimiters.".format(line_total, quote_total)
            for quote_total, line_total in Counter(quotes_per_line).most_common()))
    else:
        output.append("PASSED: NO lines with uneven text delimiter count found.")

    output.append("Checking for the same number of columns/fields on every line...")
    bad_column_count = []
    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(working_filepath, "rt", encoding=encoding, newline="") as infile:
        reader = csv.reader(infile, dialect=dialect)
        # Tolerate an empty file instead of leaking StopIteration.
        first_row = next(reader, None)
        header_column_count = len(first_row) if first_row is not None else 0

        output.append("Columns in header/first row = {}".format(header_column_count))

        if first_row is not None and not has_headers:
            # Without a header the first row is data and belongs in the
            # dispersal statistics; it matches the reference count by
            # definition, so it can never be a problem line.
            columns_per_line.append(header_column_count)

        for row in reader:
            column_count = len(row)
            columns_per_line.append(column_count)

            if header_column_count != column_count:
                # reader.line_num is the physical line in the file, which
                # keeps the numbering consistent with the quote check above.
                bad_column_count.append(
                    "Line {} = {} columns/fields.".format(reader.line_num, column_count))

    if not bad_column_count:
        output.append("PASSED: All lines have consistent column/field count.")
    else:
        output.append("FAILED: lines with different number of columns/fields detected.")
        output.append("PROBLEM LINES:")
        output.append("\n".join(bad_column_count))
        output.append("COLUMN/FIELD DISPERSAL:")
        output.append("\n".join(
            "{} lines = {} columns/fields.".format(line_total, field_total)
            for field_total, line_total in Counter(columns_per_line).most_common()))
    return "\n".join(output)