def getStatusCDR(service):
    # Expected CDR file name: R-<RUC>-<doc type code>-<series>-<number>.zip
    cdr_name_file = "R-{}-{}-{}-{:08d}.zip".format(
        service.empresa.ruc,
        service.tipo_doc.codigo,
        service.serie,
        service.numero,
    )
    rpt = service.getStatusCdr()
    paths = getConfigData("paths")["paths"]
    full_path = ""
    if rpt.content:
        # Persist the zipped CDR returned by SUNAT to the configured folder
        full_path = saveBinaryFile(
            rpt.content, paths["destino_CDR_file"], cdr_name_file)

    data = [
        ["Code", rpt.statusCode],
        ["Message", rpt.statusMessage],
    ]
    if rpt.content:
        data.append(["Ubicación", full_path])
    printSingleTable(data, " Respuesta CDR: ", False)
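
# Usage sketch (hypothetical, for illustration only): getStatusCDR expects a
# "service" object exposing empresa.ruc, tipo_doc.codigo, serie, numero and a
# getStatusCdr() call whose reply carries statusCode, statusMessage, and an
# optional zipped CDR payload in .content, exactly as used above. The
# constructor name below is invented:
#
#     svc = ServicioSunat(empresa, tipo_doc, serie="F001", numero=123)
#     getStatusCDR(svc)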
def getListaTiposDocs():
    lista_tipos_docs = []
    data = getConfigData("tipos_docs")
    if len(data["tipos_docs"]) > 0:
        for tipo_doc in data["tipos_docs"]:
            tdoc = TipoDocumento(
                tipo_doc["codigo"],
                tipo_doc["descripcion"]
            )
            lista_tipos_docs.append(tdoc)
    else:
        printOnConsole("No existen Tipos de documentos configurados.", "e")
        exit()
    return lista_tipos_docs
def getListaEmpresas():
    lista_empresas = []
    data = getConfigData("empresas")
    if len(data["empresas"]) > 0:
        for empresa in data["empresas"]:
            emp = Empresa(
                empresa["razon_social"],
                empresa["ruc"],
                ClaveSol(
                    empresa["clave_sol"]["usuario"],
                    empresa["clave_sol"]["contrasenha"]
                )
            )
            lista_empresas.append(emp)
    else:
        printOnConsole("No existen empresas configuradas.", "e")
        exit()
    return lista_empresas
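
# The two loaders above imply a configuration shaped roughly as follows.
# This is a sketch: the key names come straight from the code, while the
# sample values are invented for illustration.
#
# tipos_docs:
#   - codigo: "01"
#     descripcion: "Factura"
# empresas:
#   - razon_social: "MI EMPRESA S.A.C."
#     ruc: "20123456789"
#     clave_sol:
#       usuario: "MODDATOS"
#       contrasenha: "moddatos"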
def cleanMaster():
    config = hp.getConfigData()
    checksumClean = hp.getChecksumClean()

    # load path constants defined in config.yaml
    RAW_DIR = config['dirHeaders']['raw_dir']
    CLEAN_DIR = config['dirHeaders']['cleaned_dir']
    TR_DIR = config['dirHeaders']['transformed_dir']
    DATA_FTYPE = config['dataFileType']

    # define templates for the absolute file paths (defined in config.yaml)
    sf_template = Template('$base$type$fname$ftype')
    mf_template = Template('$base$type$fname$year$ftype')

    # load the state abbreviation map
    abbrevMap = hp.getAbbreviationMap()

    # f_checksum = open('checksum_CLEAN.txt', 'w')
    for s in config['datasets']:
        try:
            ds = s['set']
            print("Processing dataset: {:s}".format(ds['name']))
            if ds['single_file']:
                absPath = sf_template.substitute(base=ds['directory'],
                                                 type=RAW_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)
                outPath = sf_template.substitute(base=ds['directory'],
                                                 type=CLEAN_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)

                # SKIP EARLY IF FILE ALREADY PROCESSED
                if (not hp.goAheadForClean(absPath)):
                    continue

                print("Loading: {:s}".format(absPath))
                rawData = pd.read_csv(absPath)

                if ds['loc_single_column']:
                    # the location is stored as a single "Location" column
                    # ("County, State"), so split it into two columns

                    # Add 'State' column
                    rawData['State'] = np.zeros(rawData.shape[0])
                    rawData = rawData.rename(columns={'Location': 'County'})

                    # Rearrange columns. [State, County, ..., rest]
                    cols = rawData.columns.tolist()
                    cols.pop(cols.index('State'))
                    cols.pop(cols.index('County'))
                    cols = ['State', 'County'] + cols
                    rawData = rawData[cols]

                    # county value must be non-null
                    rawData = rawData.dropna(thresh=1, subset=['County'])

                    # split the single column
                    for idx, row in rawData.iterrows():
                        tmp = row['County'].split(',')
                        row['County'] = tmp[0]
                        if (len(tmp) > 1):
                            row['State'] = tmp[1]
                        else:
                            # matches STATE, UNITED STATES, or
                            # District of Columbia
                            row['State'] = "z_NA"
                        rawData.loc[idx] = row

                    # Convert county names to uppercase
                    for idx, row in rawData.iterrows():
                        row['County'] = row['County'].upper()
                        rawData.loc[idx] = row

                    # drop raw duplicates
                    rawData.drop_duplicates(subset=['State', 'County'],
                                            inplace=True)

                    # clean State and County values
                    for idx, row in rawData.iterrows():
                        if (row['State'] != 'z_NA'):
                            row['State'] = row['State'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                "'", "").replace("CITY", "").replace(
                                ".", "").replace(",", "").strip()
                        row['County'] = row['County'].upper().replace(
                            "COUNTY", "").replace("PARISH", "").replace(
                            "'", "").replace("CITY", "").replace(
                            ".", "").replace(",", "").strip()
                        rawData.loc[idx] = row
                else:
                    # data is already stored in "State" and "County";
                    # county AND state value must be non-null
                    rawData = rawData.dropna(thresh=2,
                                             subset=['County', 'State'])

                    # drop raw duplicates
                    rawData.drop_duplicates(subset=['State', 'County'],
                                            inplace=True)

                    # Rearrange columns. [State, County, ..., rest]
                    cols = rawData.columns.tolist()
                    cols.pop(cols.index('State'))
                    cols.pop(cols.index('County'))
                    cols = ['State', 'County'] + cols
                    rawData = rawData[cols]

                    for idx, row in rawData.iterrows():
                        # 'and' replaces the original bitwise '&', which
                        # evaluated (fips_flag & FIPS) == 0 rather than the
                        # intended test (cf. the multi-file branch below)
                        if (ds['fips_flag'] and row['FIPS'] == 0):
                            row['State'] = 'z_NA'
                        else:
                            row['State'] = row['State'].upper().strip()
                            row['County'] = row['County'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                "'", "").replace("CITY", "").replace(
                                ".", "").replace(",", "").strip()
                        rawData.loc[idx] = row

                rawData['State'] = rawData['State'].map(abbrevMap,
                                                        na_action='ignore')
                rawData = rawData.sort_values(['State', 'County'], axis=0)

                print("{:s} cleaned.\nOutputting to: {:s}".format(
                    ds['name'], outPath))
                rawData.to_csv(outPath, index=False)

                # Write to checksum file
                checksumClean['processedFiles'][absPath] = rawData.shape[0]
            else:
                # Handle multiple files
                for year in hp.yearList(ds['year_start'], ds['year_end'],
                                        ds['year_increment'],
                                        ds['years_absent']):
                    absPath = mf_template.substitute(base=ds['directory'],
                                                     type=RAW_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)
                    outPath = mf_template.substitute(base=ds['directory'],
                                                     type=CLEAN_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    # SKIP EARLY IF FILE ALREADY PROCESSED
                    if (not hp.goAheadForClean(absPath)):
                        continue

                    print("Loading: {:s}".format(absPath))
                    rawData = pd.read_csv(absPath)

                    if ds['loc_single_column']:
                        # add 'State' column
                        rawData['State'] = np.zeros(rawData.shape[0])
                        rawData = rawData.rename(
                            columns={'Location': 'County'})

                        # Rearrange columns. [State, County, ..., rest]
                        cols = rawData.columns.tolist()
                        cols.pop(cols.index('State'))
                        cols.pop(cols.index('County'))
                        cols = ['State', 'County'] + cols
                        rawData = rawData[cols]

                        # county value must be non-null
                        rawData = rawData.dropna(thresh=1, subset=['County'])

                        # split the single column
                        for idx, row in rawData.iterrows():
                            tmp = row['County'].split(',')
                            row['County'] = tmp[0]
                            if (len(tmp) > 1):
                                row['State'] = tmp[1]
                            else:
                                # matches STATE, UNITED STATES, or
                                # District of Columbia
                                row['State'] = "z_NA"
                            rawData.loc[idx] = row

                        # Convert county names to uppercase
                        for idx, row in rawData.iterrows():
                            row['County'] = row['County'].upper()
                            rawData.loc[idx] = row

                        # drop raw duplicates
                        rawData.drop_duplicates(subset=['State', 'County'],
                                                inplace=True)

                        # clean State and County values
                        for idx, row in rawData.iterrows():
                            if (row['State'] != 'z_NA'):
                                row['State'] = row['State'].upper().replace(
                                    "COUNTY", "").replace(
                                    "PARISH", "").replace(
                                    "'", "").replace("CITY", "").replace(
                                    ".", "").replace(",", "").strip()
                            # the original cleaned tmp[0] here, a stale value
                            # left over from the split loop above;
                            # row['County'] is what was meant
                            row['County'] = row['County'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                "'", "").replace("CITY", "").replace(
                                ".", "").replace(",", "").strip()
                            rawData.loc[idx] = row
                    else:
                        # county AND state value must be non-null
                        rawData = rawData.dropna(thresh=2,
                                                 subset=['County', 'State'])

                        # drop raw duplicates
                        rawData.drop_duplicates(subset=['State', 'County'],
                                                inplace=True)

                        # Rearrange columns. [State, County, ..., rest]
                        cols = rawData.columns.tolist()
                        cols.pop(cols.index('State'))
                        cols.pop(cols.index('County'))
                        cols = ['State', 'County'] + cols
                        rawData = rawData[cols]

                        # data is already stored in "State" and "County"
                        for idx, row in rawData.iterrows():
                            if (ds['fips_flag'] and row['FIPS'] == 0):
                                row['State'] = 'z_NA'
                            elif (row['State'] == 'z_NA'):
                                continue
                            else:
                                row['State'] = row['State'].upper().strip()
                                row['County'] = row['County'].upper().replace(
                                    "COUNTY", "").replace(
                                    "PARISH", "").replace(
                                    "'", "").replace("CITY", "").replace(
                                    ".", "").replace(",", "").strip()
                            rawData.loc[idx] = row

                    rawData['State'] = rawData['State'].map(
                        abbrevMap, na_action='ignore')
                    rawData = rawData.sort_values(['State', 'County'], axis=0)

                    print("{:s} cleaned.\nOutputting to: {:s}".format(
                        ds['name'], outPath))
                    rawData.to_csv(outPath, index=False)

                    # Write to checksum file
                    checksumClean['processedFiles'][absPath] = rawData.shape[0]
        except Exception as e:
            print(e)
            # f_checksum.write("ERROR cleaning dataset: {:s}\n".format(ds['name']))
            print("ERROR cleaning dataset: {:s}\n".format(ds['name']))

    # f_checksum.write('Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()))
    # f_checksum.close()

    # Update the clean checksum file
    hp.setChecksumClean(checksumClean)
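
# Sketch of what hp.yearList is assumed to do, inferred from its call sites
# above and from the matching range() construction in transformMaster below.
# The real implementation lives in helpers.py and may differ:
#
# def yearList(start, end, increment, absent):
#     """Return each year in [start, end] in steps of `increment`,
#     skipping any years listed in `absent` (years with no data file)."""
#     return [y for y in range(start, end + increment, increment)
#             if y not in absent]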
def transformMaster(cityMasterList):
    config = hp.getConfigData()
    checksumTransform = hp.getChecksumTransform()

    # load path constants defined in config.yaml
    RAW_DIR = config['dirHeaders']['raw_dir']
    CLEAN_DIR = config['dirHeaders']['cleaned_dir']
    TR_DIR = config['dirHeaders']['transformed_dir']
    DATA_FTYPE = config['dataFileType']

    # define templates for the absolute file paths (defined in config.yaml)
    sf_template = Template('$base$type$fname$ftype')
    mf_template = Template('$base$type$fname$year$ftype')

    # load the set of counties with "CITY"
    masterList = pd.read_csv(cityMasterList)

    # Get the dictionary representation of the master county list
    MASTER_COUNTY_SET = hp.getDictionarySet(cityMasterList)

    # open the checksum file descriptor
    f_checksum = open('checksum_TRANSFORM.txt', 'w')

    for s in config['datasets']:
        try:
            ds = s['set']
            print("Processing dataset: {:s}".format(ds['name']))
            if ds['single_file']:
                print("Single file\n")
                # Set up input / output paths
                absPath = sf_template.substitute(base=ds['directory'],
                                                 type=CLEAN_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)
                outPath = sf_template.substitute(base=ds['directory'],
                                                 type=TR_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)
                yRange = range(ds['year_start'],
                               ds['year_end'] + ds['year_increment'],
                               ds['year_increment'])
                yRangeStr = [str(y) for y in yRange]

                # SKIP EARLY IF FILE ALREADY PROCESSED
                if (not hp.goAheadForTransform(absPath, cityMasterList)):
                    continue

                print("Loading: {:s}".format(absPath))
                rd = pd.read_csv(absPath)

                # Only include "State", "County", and any "YEAR in range"
                # columns
                labels = ["State", "County"] + yRangeStr
                for col in rd.columns:
                    if col not in labels:
                        rd.drop(col, axis=1, inplace=True)

                # pad the dataframe with the master county list
                rd = padDataframe(rd, masterList)

                # Marker to indicate if row should be dropped
                rd['DROP_ROW'] = np.zeros(rd.shape[0])

                # Drop all rows not containing a county/state pair in the
                # master set (.at replaces the removed DataFrame.set_value)
                for idx, row in rd.iterrows():
                    if (MASTER_COUNTY_SET.get(row['County']) is None):
                        # print(row['County'])
                        rd.at[idx, 'DROP_ROW'] = 1
                    elif (row['State'] not in
                            MASTER_COUNTY_SET.get(row['County'])):
                        # print(row['County'])
                        rd.at[idx, 'DROP_ROW'] = 1

                # Only keep rows not slated to be dropped
                rd = rd[rd['DROP_ROW'] == 0]
                rd.drop('DROP_ROW', axis=1, inplace=True)

                # Output the transformed data to file
                print("{:s} transformed.\nOutputting to: {:s}".format(
                    ds['name'], outPath))
                rd.to_csv(outPath, index=False, line_terminator=",\n")

                # section to write the checksum file
                for county in MASTER_COUNTY_SET.keys():
                    for state in MASTER_COUNTY_SET[county]:
                        if (rd[(rd['State'] == state) &
                               (rd['County'] == county)].shape[0] == 0):
                            f_checksum.write("{:s} | {:s}\n".format(
                                county, state))

                # f_checksum.write("{:d} Counties\n\n".format(rd.shape[0]))
                checksumTransform['processedFiles'][absPath] = rd.shape[0]
            else:
                print("Multiple files\n")
                for year in hp.yearList(ds['year_start'], ds['year_end'],
                                        ds['year_increment'],
                                        ds['years_absent']):
                    absPath = mf_template.substitute(base=ds['directory'],
                                                     type=CLEAN_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)
                    outPath = mf_template.substitute(base=ds['directory'],
                                                     type=TR_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    # SKIP EARLY IF FILE ALREADY PROCESSED
                    if (not hp.goAheadForTransform(absPath, cityMasterList)):
                        continue

                    print("Loading: {:s}".format(absPath))
                    rd = pd.read_csv(absPath)

                    # Only include columns defined in the configuration file
                    for col in rd.columns:
                        if col not in ds['data_labels']:
                            rd.drop(col, axis=1, inplace=True)

                    rd = padDataframe(rd, masterList)

                    # Marker to indicate if row should be dropped
                    rd['DROP_ROW'] = np.zeros(rd.shape[0])

                    # Drop all rows not containing a county/state pair in the
                    # master set
                    for idx, row in rd.iterrows():
                        if (MASTER_COUNTY_SET.get(row['County']) is None):
                            rd.at[idx, 'DROP_ROW'] = 1
                            # rd.drop(idx, inplace=True)
                        elif (row['State'] not in
                                MASTER_COUNTY_SET.get(row['County'])):
                            rd.at[idx, 'DROP_ROW'] = 1
                            # rd.drop(idx, inplace=True)

                    # Only keep rows not slated to be dropped
                    rd = rd[rd['DROP_ROW'] == 0]
                    rd.drop('DROP_ROW', axis=1, inplace=True)

                    # Output the transformed data to file
                    print("{:s} transformed. Outputting to: {:s}".format(
                        ds['name'], outPath))
                    rd.to_csv(outPath, index=False, line_terminator=",\n")

                    # section to write the checksum file
                    for county in MASTER_COUNTY_SET.keys():
                        for state in MASTER_COUNTY_SET[county]:
                            if (rd[(rd['State'] == state) &
                                   (rd['County'] == county)].shape[0] == 0):
                                f_checksum.write("{:s} | {:s}\n".format(
                                    county, state))

                    checksumTransform['processedFiles'][absPath] = rd.shape[0]
            # f_checksum.write("Finished transforming dataset: {:s}\n\n".format(ds['name']))
            # f_checksum.write("---------------------------------------------------\n")
        except Exception as e:
            print(e)
            # f_checksum.write("ERROR transforming dataset: {:s}\n".format(ds['name']))

    # f_checksum.write('Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()))
    # f_checksum.close()
    hp.setChecksumTransform(checksumTransform)

    # Update the "previousRun.yaml" file
    prevRun = hp.getPreviousRunData()
    prevRun['masterCountyListFile'] = cityMasterList
    prevRun['numCountiesInMaster'] = masterList.shape[0]
    hp.setPreviousRunData(prevRun)
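
# MASTER_COUNTY_SET, as used above, is assumed to map each county name to the
# collection of states containing a county of that name, e.g. (illustrative
# values only):
#
# MASTER_COUNTY_SET = {
#     'WASHINGTON': {'AR', 'PA', 'TX'},
#     'ORANGE': {'CA', 'FL', 'NY'},
# }
#
# so the drop test reads: keep a row only when its county exists in the
# master set AND its state appears under that county.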
import helpers as hp
from string import Template
import pandas as pd
import numpy as np
import datetime

config = hp.getConfigData()

# load path constants defined in config.yaml
RAW_DIR = config['dirHeaders']['raw_dir']
CLEAN_DIR = config['dirHeaders']['cleaned_dir']
TR_DIR = config['dirHeaders']['transformed_dir']
DATA_FTYPE = config['dataFileType']

# define templates for the absolute file paths (defined in config.yaml)
sf_template = Template('$base$type$fname$ftype')
mf_template = Template('$base$type$fname$year$ftype')

# load the set of counties with "CITY"
cityList = pd.read_csv('CountyLists/citySet.csv')
baseList = pd.read_csv('CountyLists/baseSet.csv')

# get the master set of counties to check against
MASTER_COUNTY_SET = hp.getDictionarySet()

# open the checksum file descriptor
f_checksum = open('checksum_TRANSFORM.txt', 'w')

for s in config['datasets']:
    try:
        ds = s['set']