Example #1
    def sniff_for_dialect(self, delimited_file_path: str) -> csv.Dialect:
        """Sniff delimited file and return csv.Dialect."""

        delimited_filepath = join_n_norm(delimited_file_path)
        assert verify_filepath(delimited_filepath)

        sniffed_encoding = self.sniff_for_encoding(delimited_filepath)

        with open(delimited_filepath, "rt", encoding=sniffed_encoding) as ftt:
            # Note: in text mode read() counts characters, not bytes.
            working_text = ftt.read(self.bytes_to_read)

            logger.debug("Sniffing for dialect; '{}'.".format(delimited_filepath))
            try:
                if self.poss_delimiters:
                    sniffed = Sniffer().sniff(working_text, delimiters=self.poss_delimiters)
                else:
                    sniffed = Sniffer().sniff(working_text)
                    if len(sniffed.delimiter) != 1 or len(sniffed.quotechar) != 1:
                        poss_delimiters = '|\t,'
                        sniffed = Sniffer().sniff(working_text,
                                                  delimiters=poss_delimiters)
            except csv.Error:
                logger.error("There is something wrong with '{}'.".format(delimited_filepath))
                raise

        return sniffed
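A minimal usage sketch, assuming DialectDetector (imported in Example #6) can be constructed with default settings; the feed path is hypothetical:

detector = DialectDetector()
dialect = detector.sniff_for_dialect("static/inv_test.csv")  # hypothetical path
print("delimiter={!r} quotechar={!r}".format(dialect.delimiter, dialect.quotechar))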
Example #2
def lb_fs_monitor(base_dir: str,
                  log_dir: str,
                  logging_lvl: str = 'INFO',
                  log_type: str = "fs_monitor") -> None:
    """
    Configure logging via logbook for assorted file system monitoring processes.
    *logging_lvl sets the level for both the TimedRotatingFileHandler written to
    log_dir and the StreamHandler writing to stdout.
    """

    fmt_string = ("[{record.time:%Y-%m-%d %H:%M:%S}] - "
                  "{record.level} - "
                  "{record.func_name} - "
                  "{record.message}")
    logbook.set_datetime_format("local")

    veribuild_dir(log_dir)
    log_file = "{}_{}.log".format(split(base_dir)[-1], log_type)
    log_path = join_n_norm(log_dir, log_file)
    file_handler = logbook.TimedRotatingFileHandler(log_path,
                                                    level=logging_lvl,
                                                    format_string=fmt_string,
                                                    backup_count=21)

    stream_handler = logbook.StreamHandler(stdout,
                                           level=logging_lvl,
                                           format_string=fmt_string,
                                           bubble=True)

    file_handler.push_application()
    stream_handler.push_application()
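A usage sketch with hypothetical directories; because both handlers are pushed application-wide, any logbook Logger created afterwards in the process is routed through them:

lb_fs_monitor(base_dir="/data/feeds/incoming",  # hypothetical directory being monitored
              log_dir="/data/logs/fs_monitor",  # hypothetical log directory
              logging_lvl="WARNING")
logbook.Logger("fs_monitor").warning("watcher restarted")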
Example #3
def lb_pp(log_dir: str, in_file: str, logging_level: str, log_type: str = "pp") -> None:
    """
    Configure logging via logbook for assorted preprocessing scripts.
    *logging_level sets the level for logs written to file; the StreamHandler on
    stdout has no level set and therefore emits all records.
    """

    logbook.set_datetime_format("local")

    veribuild_dir(log_dir)

    log_file = "{}_{}.log".format(in_file, log_type)
    log_path = join_n_norm(log_dir, log_file)

    fmt_string = ("[{record.time:%Y-%m-%d %H:%M:%S}] - "
                  "level:{record.level} - "
                  "filename:{record.filename} - "
                  "line:{record.lineno} - "
                  "function:{record.func_name} - "
                  "message:{record.message}")

    timed_rot_fh = logbook.TimedRotatingFileHandler(log_path,
                                                    level=logging_level,
                                                    format_string=fmt_string,
                                                    backup_count=21)
    stream_h = logbook.StreamHandler(stdout,
                                     format_string=fmt_string,
                                     bubble=True)

    timed_rot_fh.push_application()
    stream_h.push_application()
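A sketch along the same lines (paths and logger name are hypothetical); since the StreamHandler has no level set, a DEBUG record still reaches stdout while the file handler filters it out at INFO:

lb_pp(log_dir="/data/logs/preprocess",  # hypothetical log directory
      in_file="inv_test",               # becomes part of the log file name
      logging_level="INFO")
logbook.Logger("preprocess").debug("visible on stdout, filtered from the log file")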
Example #4
    def sniff_for_encoding(self, delimited_file_path: str) -> str:
        """Sniff file to try and determine and return encoding else returns latin-1."""

        working_filepath = join_n_norm(delimited_file_path)
        assert verify_filepath(working_filepath)

        detector = UniversalDetector()

        with open(working_filepath, 'rb') as working_file:
            logger.debug("Sniffing for encoding; '{}'".format(delimited_file_path))
            for _ in range(self.encoding_chunks_to_read):
                if not detector.done:
                    detector.feed(working_file.read(4096))
                else:
                    break
            detector.close()
            # result['encoding'] can be None when detection fails, even after
            # close() has marked the detector as done, so check the result itself.
            encoding = detector.result.get('encoding')
            if not encoding:
                encoding = "latin-1"
                logger.debug("Encoding not detected; set to latin-1.")
            else:
                logger.debug("Detected encoding; returning '{}'".format(encoding))

        return encoding
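A short sketch showing the detected encoding being passed straight to open(); the path is hypothetical and DialectDetector is assumed to construct with defaults:

detector = DialectDetector()
encoding = detector.sniff_for_encoding("static/inv_test.csv")  # hypothetical path
with open("static/inv_test.csv", "rt", encoding=encoding) as infile:
    header_row = infile.readline()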
Example #5
def integrity_check(infile_path: str) -> str:
    """
    Take filepath and report on the number of columns detected per line, extra quotechars,
    etc. Helps to detect problems in source files which may cause issues when creating
    schema files and indicator for whether file will need preprocessor.
    """

    working_filepath = join_n_norm(infile_path)
    assert verify_filepath(working_filepath)

    dd = DialectDetector()
    dialect = dd.sniff_for_dialect(working_filepath)
    encoding = dd.sniff_for_encoding(working_filepath)
    # Sniffer.has_header() expects a sample of the file's text, not a path.
    with open(working_filepath, "rt", encoding=encoding) as sample_file:
        has_headers = csv.Sniffer().has_header(sample_file.read(4096))

    quotes_per_line = list()
    columns_per_line = list()
    output = list()

    output.append("Filename: '{}'".format(split(working_filepath)[-1]))
    output.append("Detected Field Delimiter: [{}]".format(dialect.delimiter))
    output.append("Detected Text Delimiter: [{}]".format(dialect.quotechar))
    output.append("Checking for even # of Text Delimiters on every line...")

    with open(working_filepath, "rt", encoding=encoding) as infile:
        reader = infile.readlines()
        uneven_tdelimiter_list = list()

        for n, line in enumerate(reader, start=1):
            quotes = Counter(line)[dialect.quotechar]
            quotes_per_line.append(quotes)

            if quotes % 2 != 0:
                uneven_tdelimiter_list.append(
                    "Line {} = {} [{}]'s".format(n, quotes, dialect.quotechar))

        if uneven_tdelimiter_list:
            output.append("FAILED: lines with uneven number of text delimiters detected.")
            output.append("PROBLEM LINES:")
            output.append("\n".join(uneven_tdelimiter_list))
            output.append("TEXT DELIMITER DISPERSAL:")
            quote_counter = Counter(quotes_per_line)
            quote_dispersal = list()
            for n in quote_counter.most_common():
                quote_dispersal.append(
                    "{} lines = {} text delimiters.".format(n[1], n[0]))
            output.append("\n".join(quote_dispersal))
        else:
            output.append("PASSED: NO lines with uneven text delimiter count found.")

    output.append("Checking for the same number of columns/fields on every line...")
    with open(working_filepath, "rt", encoding=encoding) as infile:
        reader = csv.reader(infile, dialect=dialect)
        headers = next(reader)
        if not has_headers:
            # csv reader objects cannot seek; rewind the underlying file and
            # rebuild the reader so the first row is counted as data rather than headers.
            infile.seek(0)
            reader = csv.reader(infile, dialect=dialect)

        header_column_count = len(headers)

        output.append("Columns in header/first row = {}".format(header_column_count))

        problem_line_numbers = list()
        bad_column_count = list()

        # Number lines as they appear in the file (line 1 is the header when present).
        for n, line in enumerate(reader, start=2 if has_headers else 1):
            column_count = len(line)
            columns_per_line.append(column_count)

            if header_column_count != column_count:
                bad_column_count.append(
                    "Line {} = {} columns/fields.".format(n, column_count))
                problem_line_numbers.append(str(n))

    if not problem_line_numbers:
        output.append("PASSED: All lines have consistent column/field count.")
    else:
        output.append("FAILED: lines with different number of columns/fields detected.")
        output.append("PROBLEM LINES:")
        output.append("\n".join(bad_column_count))
        output.append("COLUMN/FIELD DISPERSAL:")
        column_counter = Counter(columns_per_line)
        column_dispersal = list()
        for n in column_counter.most_common():
            column_dispersal.append("{} lines = {} columns/fields.".format(
                n[1], n[0]))
        output.append("\n".join(column_dispersal))
    return "\n".join(output)
Example #6
from os.path import dirname
from tempfile import NamedTemporaryFile
from types import GeneratorType

import pytest
from logbook import Logger

from csv_tools import (DialectDetector, SchemaEngine, integrity_check)
from file_tools import join_n_norm
from lb_configs.logbook_config import lb_streaming


__author__ = "cwandrews"

logger = Logger()
lb_streaming("INFO")


W_DIR = dirname(__file__)

INV_TEST_REG = join_n_norm(W_DIR, "static", "inv_test.csv")
INV_TEST_NH = join_n_norm(W_DIR, "static", "inv_test_headers_none.csv")
INV_TEST_PRO = join_n_norm(W_DIR, "static", "inv_test_processed.csv")
INV_TEST_PIPE = join_n_norm(W_DIR, "static", "inv_test_pipe.csv")
INV_TEST_PIPE2 = join_n_norm(W_DIR, "static", "maxie_price_rv_inv.csv")
INV_TEST_WH = join_n_norm(W_DIR, "static", "inv_test_headers_weird.csv")
INV_TEST_TAB = join_n_norm(W_DIR, "static", "inv_test_tab.csv")
INV_TEST_BAD = join_n_norm(W_DIR, "static", "feed.csv")

INTEG_RET = join_n_norm(W_DIR, "static", "tfi_feeds.txt")
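

# A minimal test sketch, not part of the original module: integrity_check()
# from Example #5 begins its report with "Filename:", assuming the static
# test file referenced by INV_TEST_REG exists on disk.
def test_integrity_check_reports_filename():
    report = integrity_check(INV_TEST_REG)
    assert report.startswith("Filename:")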


@pytest.fixture(scope="function")
def test_file(request):

    temp_file = NamedTemporaryFile(delete=False)