Example #1
    def sniff_for_dialect(self, delimited_file_path: str) -> csv.Dialect:
        """Sniff delimited file and return csv.Dialect."""

        delimited_filepath = join_n_norm(delimited_file_path)
        assert verify_filepath(delimited_filepath)

        sniffed_encoding = self.sniff_for_encoding(delimited_filepath)

        with open(delimited_filepath, "rt", encoding=sniffed_encoding) as ftt:
            # Note: in text mode read() counts characters, not bytes.
            working_text = ftt.read(self.bytes_to_read)

            logger.debug("Sniffing for dialect; '{}'.".format(delimited_filepath))
            try:
                if self.poss_delimiters:
                    sniffed = Sniffer().sniff(working_text, delimiters=self.poss_delimiters)
                else:
                    sniffed = Sniffer().sniff(working_text)
                    if len(sniffed.delimiter) != 1 or len(sniffed.quotechar) != 1:
                        poss_delimiters = '|\t,'
                        sniffed = Sniffer().sniff(working_text,
                                                  delimiters=poss_delimiters)
            except csv.Error:
                logger.error("There is something wrong with '{}'.".format(delimited_filepath))
                raise

        return sniffed
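A minimal usage sketch, assuming DialectDetector (imported in Example #6) can be constructed with default settings; the feed path is hypothetical:

detector = DialectDetector()
dialect = detector.sniff_for_dialect("static/inv_test.csv")  # hypothetical path
print("delimiter={!r} quotechar={!r}".format(dialect.delimiter, dialect.quotechar))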
Example #2
def lb_fs_monitor(base_dir: str,
                  log_dir: str,
                  logging_lvl: str = 'INFO',
                  log_type: str = "fs_monitor") -> None:
    """
    Configure logging via logbook for assorted file system monitoring processes.
    *logging_lvl sets the level for both the TimedRotatingFileHandler written to
    log_dir and the StreamHandler writing to stdout.
    """

    fmt_string = ("[{record.time:%Y-%m-%d %H:%M:%S}] - "
                  "{record.level} - "
                  "{record.func_name} - "
                  "{record.message}")
    logbook.set_datetime_format("local")

    veribuild_dir(log_dir)
    log_file = "{}_{}.log".format(split(base_dir)[-1], log_type)
    log_path = join_n_norm(log_dir, log_file)
    file_handler = logbook.TimedRotatingFileHandler(log_path,
                                                    level=logging_lvl,
                                                    format_string=fmt_string,
                                                    backup_count=21)

    stream_handler = logbook.StreamHandler(stdout,
                                           level=logging_lvl,
                                           format_string=fmt_string,
                                           bubble=True)

    file_handler.push_application()
    stream_handler.push_application()
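A usage sketch with hypothetical directories; because both handlers are pushed application-wide, any logbook Logger created afterwards in the process is routed through them:

lb_fs_monitor(base_dir="/data/feeds/incoming",  # hypothetical directory being monitored
              log_dir="/data/logs/fs_monitor",  # hypothetical log directory
              logging_lvl="WARNING")
logbook.Logger("fs_monitor").warning("watcher restarted")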
Example #3
def lb_pp(log_dir: str, in_file: str, logging_level: str, log_type: str = "pp") -> None:
    """
    Configure logging via logbook for assorted preprocessing scripts.
    *logging_level sets the level for logs written to file; the StreamHandler on
    stdout has no level set and therefore emits all records.
    """

    logbook.set_datetime_format("local")

    veribuild_dir(log_dir)

    log_file = "{}_{}.log".format(in_file, log_type)
    log_path = join_n_norm(log_dir, log_file)

    fmt_string = ("[{record.time:%Y-%m-%d %H:%M:%S}] - "
                  "level:{record.level} - "
                  "filename:{record.filename} - "
                  "line:{record.lineno} - "
                  "function:{record.func_name} - "
                  "message:{record.message}")

    timed_rot_fh = logbook.TimedRotatingFileHandler(log_path,
                                                    level=logging_level,
                                                    format_string=fmt_string,
                                                    backup_count=21)
    stream_h = logbook.StreamHandler(stdout,
                                     format_string=fmt_string,
                                     bubble=True)

    timed_rot_fh.push_application()
    stream_h.push_application()
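A sketch along the same lines (paths and logger name are hypothetical); since the StreamHandler has no level set, a DEBUG record still reaches stdout while the file handler filters it out at INFO:

lb_pp(log_dir="/data/logs/preprocess",  # hypothetical log directory
      in_file="inv_test",               # becomes part of the log file name
      logging_level="INFO")
logbook.Logger("preprocess").debug("visible on stdout, filtered from the log file")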
Example #4
    def sniff_for_encoding(self, delimited_file_path: str) -> str:
        """Sniff file to try and determine and return encoding else returns latin-1."""

        working_filepath = join_n_norm(delimited_file_path)
        assert verify_filepath(working_filepath)

        detector = UniversalDetector()

        with open(working_filepath, 'rb') as working_file:
            logger.debug("Sniffing for encoding; '{}'".format(delimited_file_path))
            for _ in range(self.encoding_chunks_to_read):
                if not detector.done:
                    detector.feed(working_file.read(4096))
                else:
                    break
            detector.close()
            # result['encoding'] can be None when detection fails, even after
            # close() has marked the detector as done, so check the result itself.
            encoding = detector.result.get('encoding')
            if not encoding:
                encoding = "latin-1"
                logger.debug("Encoding not detected; set to latin-1.")
            else:
                logger.debug("Detected encoding; returning '{}'".format(encoding))

        return encoding
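A short sketch showing the detected encoding being passed straight to open(); the path is hypothetical and DialectDetector is assumed to construct with defaults:

detector = DialectDetector()
encoding = detector.sniff_for_encoding("static/inv_test.csv")  # hypothetical path
with open("static/inv_test.csv", "rt", encoding=encoding) as infile:
    header_row = infile.readline()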
Example #5
def integrity_check(infile_path: str) -> str:
    """
    Take filepath and report on the number of columns detected per line, extra quotechars,
    etc. Helps to detect problems in source files which may cause issues when creating
    schema files and indicator for whether file will need preprocessor.
    """

    working_filepath = join_n_norm(infile_path)
    assert verify_filepath(working_filepath)

    dd = DialectDetector()
    dialect = dd.sniff_for_dialect(working_filepath)
    encoding = dd.sniff_for_encoding(working_filepath)
    # Sniffer.has_header() expects a sample of the file's text, not a path.
    with open(working_filepath, "rt", encoding=encoding) as sample_file:
        has_headers = csv.Sniffer().has_header(sample_file.read(4096))

    quotes_per_line = list()
    columns_per_line = list()
    output = list()

    output.append("Filename: '{}'".format(split(working_filepath)[-1]))
    output.append("Detected Field Delimiter: [{}]".format(dialect.delimiter))
    output.append("Detected Text Delimiter: [{}]".format(dialect.quotechar))
    output.append("Checking for even # of Text Delimiters on every line...")

    with open(working_filepath, "rt", encoding=encoding) as infile:
        reader = infile.readlines()
        uneven_tdelimiter_list = list()

        for n, line in enumerate(reader, start=1):
            quotes = Counter(line)[dialect.quotechar]
            quotes_per_line.append(quotes)

            if quotes % 2 != 0:
                uneven_tdelimiter_list.append(
                    "Line {} = {} [{}]'s".format(n, quotes, dialect.quotechar))

        if uneven_tdelimiter_list:
            output.append("FAILED: lines with uneven number of text delimiters detected.")
            output.append("PROBLEM LINES:")
            output.append("\n".join(uneven_tdelimiter_list))
            output.append("TEXT DELIMITER DISPERSAL:")
            quote_counter = Counter(quotes_per_line)
            quote_dispersal = list()
            for n in quote_counter.most_common():
                quote_dispersal.append(
                    "{} lines = {} text delimiters.".format(n[1], n[0]))
            output.append("\n".join(quote_dispersal))
        else:
            output.append("PASSED: NO lines with uneven text delimiter count found.")

    output.append("Checking for the same number of columns/fields on every line...")
    with open(working_filepath, "rt", encoding=encoding) as infile:
        reader = csv.reader(infile, dialect=dialect)
        headers = next(reader)
        if not has_headers:
            # csv reader objects cannot seek; rewind the underlying file and
            # rebuild the reader so the first row is counted as data rather than headers.
            infile.seek(0)
            reader = csv.reader(infile, dialect=dialect)

        header_column_count = len(headers)

        output.append("Columns in header/first row = {}".format(header_column_count))

        problem_line_numbers = list()
        bad_column_count = list()

        # Number lines as they appear in the file (line 1 is the header when present).
        for n, line in enumerate(reader, start=2 if has_headers else 1):
            column_count = len(line)
            columns_per_line.append(column_count)

            if header_column_count != column_count:
                bad_column_count.append(
                    "Line {} = {} columns/fields.".format(n, column_count))
                problem_line_numbers.append(str(n))

    if not problem_line_numbers:
        output.append("PASSED: All lines have consistent column/field count.")
    else:
        output.append("FAILED: lines with different number of columns/fields detected.")
        output.append("PROBLEM LINES:")
        output.append("\n".join(bad_column_count))
        output.append("COLUMN/FIELD DISPERSAL:")
        column_counter = Counter(columns_per_line)
        column_dispersal = list()
        for n in column_counter.most_common():
            column_dispersal.append("{} lines = {} columns/fields.".format(
                n[1], n[0]))
        output.append("\n".join(column_dispersal))
    return "\n".join(output)
Example #6
from os.path import dirname
from tempfile import NamedTemporaryFile
from types import GeneratorType

import pytest
from logbook import Logger

from csv_tools import (DialectDetector, SchemaEngine, integrity_check)
from file_tools import join_n_norm
from lb_configs.logbook_config import lb_streaming


__author__ = "cwandrews"

logger = Logger()
lb_streaming("INFO")


W_DIR = dirname(__file__)

INV_TEST_REG = join_n_norm(W_DIR, "static", "inv_test.csv")
INV_TEST_NH = join_n_norm(W_DIR, "static", "inv_test_headers_none.csv")
INV_TEST_PRO = join_n_norm(W_DIR, "static", "inv_test_processed.csv")
INV_TEST_PIPE = join_n_norm(W_DIR, "static", "inv_test_pipe.csv")
INV_TEST_PIPE2 = join_n_norm(W_DIR, "static", "maxie_price_rv_inv.csv")
INV_TEST_WH = join_n_norm(W_DIR, "static", "inv_test_headers_weird.csv")
INV_TEST_TAB = join_n_norm(W_DIR, "static", "inv_test_tab.csv")
INV_TEST_BAD = join_n_norm(W_DIR, "static", "feed.csv")

INTEG_RET = join_n_norm(W_DIR, "static", "tfi_feeds.txt")
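

# A minimal test sketch, not part of the original module: integrity_check()
# from Example #5 begins its report with "Filename:", assuming the static
# test file referenced by INV_TEST_REG exists on disk.
def test_integrity_check_reports_filename():
    report = integrity_check(INV_TEST_REG)
    assert report.startswith("Filename:")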


@pytest.fixture(scope="function")
def test_file(request):

    temp_file = NamedTemporaryFile(delete=False)