def parse_and_save_smdr_data():
    """Parse the first few SMDR attachment files and print their CSV rows.

    Reads files from ATTACHMENTS_PATH, converts each ``.slk`` file to CSV
    via SylkParser, and prints the resulting lines to stdout.
    """
    all_files = get_all_files(ATTACHMENTS_PATH)
    logger.debug(f"GOT {len(all_files)} files to parse")
    # NOTE(review): only the first five attachments are processed — looks
    # like a debug/sampling limit; confirm before relying on full coverage.
    for filename in all_files[:5]:
        # only parse slk files - ignore gzipped files
        if not filename.endswith("slk"):
            continue
        file_path = os.path.join(ATTACHMENTS_PATH, filename)
        parser = SylkParser(file_path)
        fbuf = StringIO()
        parser.to_csv(fbuf)
        for line in fbuf.getvalue().split("\n"):
            print(line)
def test_stream_unicode():
    """Check that ``use_unicode`` controls the type of streamed cell values.

    With ``use_unicode=True`` every cell must be a text string; with
    ``use_unicode=False`` every cell must be a (byte-oriented) ``str``.
    """
    fpath = get_data_path("balance_analytique_cp1252.slk")

    # Fix: the bare name ``unicode`` only exists on Python 2 and raises
    # NameError on Python 3, where every str is already a text string.
    try:
        text_type = unicode  # noqa: F821 — Python 2 only
    except NameError:
        text_type = str

    parser = SylkParser(fpath, use_unicode=True)
    for line in parser:
        for cell in line:
            assert isinstance(cell, text_type)

    parser = SylkParser(fpath, use_unicode=False)
    for line in parser:
        for cell in line:
            assert isinstance(cell, str)
def _test_one(test_filename, expected_results_filename, headers=None, encoding='cp1252'):
    """Parse one fixture file to CSV and compare against the expected output.

    :param test_filename: name of the .slk fixture to parse
    :param expected_results_filename: name of the file holding expected CSV
    :param headers: optional header rows passed through to SylkParser
    :param encoding: input encoding, cp1252 by default
    """
    if headers is None:
        headers = []
    parser = SylkParser(get_data_path(test_filename),
                        headers=headers, encoding=encoding)
    out = StringIO()
    parser.to_csv(out)
    actual = out.getvalue()
    with open(get_data_path(expected_results_filename)) as handle:
        expected = handle.read()
    # Trailing whitespace/newline differences are not significant.
    assert actual.strip() == expected.strip()
    print("Tested {}".format(test_filename))
"""Convert a SYLK (.slk) file to CSV.

Usage: script.py SOURCE_FILE OUTPUT_FILE
"""
import sys

from sylk_parser import SylkParser


def main(argv):
    """Parse argv[1] as SYLK and write CSV to argv[2]; return exit status."""
    # Fix: validate arguments instead of dying with a bare IndexError.
    if len(argv) != 3:
        sys.stderr.write("usage: {} SOURCE_FILE OUTPUT_FILE\n".format(argv[0]))
        return 1
    parser = SylkParser(argv[1])
    # NOTE(review): "wb" kept from the original; on Python 3 a csv writer
    # normally wants text mode ("w", newline="") — confirm what
    # SylkParser.to_csv expects before changing.
    with open(argv[2], "wb") as fbuf:
        parser.to_csv(fbuf)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def parse_xls(xls):
    # Parse a SYLK workbook made of stacked sub-tables, each introduced by an
    # "AREA # <n>" row and terminated by a "Total" (or truncated "Tabla vac…")
    # row. Returns {column_name: [values per area]} plus an INDEX column of
    # area numbers, or None when fewer than 100 areas were found (treated as
    # an incomplete sheet). INDEX and format_col are module-level names
    # defined elsewhere in the file.
    worksheet = SylkParser(xls)
    SUBTABLE_START = "AREA"
    SUBTABLE_END = "Total"
    # presumably a truncated "Tabla vacía" (empty table) marker — confirm
    SUBTABLE_EMPTY = "Tabla vac"
    area_row = None          # row index of the current sub-table's AREA line
    dataset = {}             # area number -> {column name: raw cell value}
    cross_cols = None        # suffixes derived from the header row
    current_index = None     # area number of the sub-table being read
    previous_row = None      # last row seen; becomes the header row
    for i, row in enumerate(worksheet):
        first_col = str(row[0]).strip()
        if area_row is None:
            # Scanning for the next sub-table's "AREA # <n>" header.
            if first_col.startswith(SUBTABLE_START):
                number = int(
                    re.match(fr"{SUBTABLE_START}\s*#\s*(\d+)\s*",
                             first_col).group(1))
                current_index = number
                name = row[1]  # NOTE(review): assigned but never used
                area_row = i
                header_row = None
        else:
            if first_col == SUBTABLE_END or first_col.startswith(
                    SUBTABLE_EMPTY):
                # Sub-table ended (or declared empty): make sure the area
                # exists in the dataset even with no data rows, then resume
                # scanning for the next AREA header.
                if current_index not in dataset:
                    dataset[current_index] = {}
                area_row = None
            elif i > area_row + 2 and first_col:
                # actual rows
                if current_index not in dataset:
                    # First data row of this sub-table: the row just above it
                    # is the column header row.
                    dataset[current_index] = {}
                    header_row = previous_row
                first_header = str(header_row[1]).strip()
                if first_header != 'Casos':
                    # Cross-tabulated sub-table: collect ".<col>" suffixes
                    # from the header until the 'Total' column.
                    cross_cols = []
                    while str(header_row[
                            1 + len(cross_cols)]).strip() != 'Total':
                        cross_cols.append(
                            '.' + format_col(header_row[1 + len(cross_cols)]))
                else:
                    # Simple sub-table: a single unsuffixed value column.
                    cross_cols = ['']
                for j, col in enumerate(cross_cols):
                    value = row[1 + j]
                    if not value:
                        # NOTE(review): exit(1) aborts the whole process from
                        # inside a parsing helper — consider raising instead.
                        print(f"cannot parse row {row}")
                        exit(1)
                    dataset[current_index][format_col(first_col) + col] = value
        previous_row = row
    # NOTE(review): 100 looks like a minimum expected area count — confirm.
    if len(dataset) < 100:
        return None
    # Pivot: from per-area dicts to per-column value lists, filling missing
    # cells with 0 and normalising '-' placeholders to 0.
    dataset_fixed = {INDEX: []}
    for area, cols in dataset.items():
        for col in cols:
            dataset_fixed[col] = []
    for area, cols in dataset.items():
        dataset_fixed[INDEX].append(area)
        dataset[area] = {col: vals for col, vals in cols.items()}
        for col in dataset_fixed.keys():
            if col != INDEX:
                value = dataset[area][col] if col in dataset[area] else '0'
                value = int(format_col(value.replace('-', '0')))
                dataset_fixed[col].append(value)
    dataset = dataset_fixed
    return dataset
#!/usr/bin/env python
# Hmmm. doesn't seem to work
# NOTE(review): .oleo is GNU Oleo's native format, not SYLK — SylkParser
# may simply not understand this input; confirm the fixture is right.
from io import StringIO

from sylk_parser import SylkParser

parser = SylkParser("loop.oleo")
fbuf = StringIO()
parser.to_csv(fbuf)
test_results = fbuf.getvalue()
# Fix: Python-2 print statement was a SyntaxError on Python 3.
print(test_results)