def process_file(fname, schemafile):
    """Parse a text table-specification file and emit a SQL scanner.

    Scans *fname* line by line: the first line matching title_re names the
    Table, the first line matching version_re sets its version, and every
    line matching variable_re defines a Variable.  Duplicate variable names
    are de-duplicated by appending a counter (name2, name3, ...).  Finally
    writes a SPEC_FILE #define and the table's SQL scanner to *schemafile*.

    Args:
        fname: path of the text specification file to parse.
        schemafile: open writable file object receiving the generated C code.
    """
    table = None
    # fix: open the spec file with a context manager so the handle is
    # closed deterministically (the original leaked it).
    with open(fname) as specfile:
        for line in specfile:
            if not table:
                m = title_re.search(line)
                if m:
                    table = Table(name=m.group(1))
            if table and not table.version:
                m = version_re.search(line)
                if m:
                    table.version = m.group(1)
            m = variable_re.search(line)
            if m:
                v = Variable()
                (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
                # De-duplicate variable names by appending a counter.
                # (renamed the comprehension variable so it no longer
                # shadows the Variable being built)
                oname = name
                count = 2
                while name in [existing.name for existing in table.vars]:
                    name = "{}{}".format(oname, count)
                    count += 1
                v.define_from_row([position, name, desc, vtype])
                # A "start-end" column spec becomes a [start, end] pair.
                if "-" in column:
                    v.column = [int(x) for x in column.split("-")]
                table.add_variable(v)
    schemafile.write('#define SPEC_FILE "{}"\n'.format(fname))
    table.write_sql_scanner(schemafile)
def load_table_schema_from_census_xlsx_file(self, *, filename, prefix=""):
    """Read multiple tables from a xlsx file. Only ingest the tables that
    have the prefix. We only look at the first 5 columns.

    Each worksheet becomes one Table (named after the sheet).  Rows supply
    (column_id, column_name, oracle_datatype, desc, legal_values); rows
    carrying only legal_values extend the most recently defined Variable.

    NOTE(review): *prefix* is accepted but never used in this body —
    confirm whether prefix filtering was meant to happen here.

    Raises:
        RuntimeError: if a worksheet fails verify_census_worksheet().
    """
    from openpyxl import load_workbook
    from openpyxl.cell.read_only import EmptyCell

    # fix: read_only expects a bool; the string 'True' only worked because
    # any non-empty string is truthy.
    wb = load_workbook(filename=filename, read_only=True)
    for ws in wb.worksheets:
        print("Processing worksheet {}".format(ws.title))
        if not self.verify_census_worksheet(ws):
            logging.error(
                "This does not appear to be a US Census Bureau XLSX specification file"
            )
            raise RuntimeError(
                "This does not appear to be a US Census Bureau XLSX specification file"
            )
        column = 0
        table = Table(name=ws.title)
        self.add_table(table)
        header_skipped = False
        # fix: v must exist before the loop; a legal-values-only row that
        # precedes any variable row raised NameError in the original.
        v = None
        for row in ws.iter_rows():
            # Skip empty rows
            if isinstance(row[0], EmptyCell):
                continue
            # skip the header
            if not header_skipped:
                header_skipped = True
                continue
            # Only the first five columns are considered
            values = []
            for i in range(0, 5):
                if isinstance(row[i], EmptyCell):
                    values.append("")
                elif isinstance(row[i].value, str):
                    values.append(row[i].value.strip())
                else:
                    values.append(row[i].value)
            (column_id, column_name, oracle_datatype, desc, legal_values) = values
            if (not column_id) and not legal_values:
                # Ignore the blank line
                continue
            if column_id:
                if not isinstance(column_id, int):
                    logging.info("Column ID is invalid; skipping")
                    continue
                v = Variable(position=column_id,
                             name=column_name,
                             desc=desc,
                             vtype=oracle_datatype,
                             column=column)
                # advance to the next column
                column += v.width
                table.add_variable(v)
            if v and legal_values:
                # legal_values holds one candidate range per line.
                for possible_legal_value in legal_values.split("\n"):
                    r = Range.extract_range_and_desc(possible_legal_value)
                    if r:
                        v.add_range(r)
def process_layout_line(*, table, line):
    """Parse one IPUMS layout line and add the Variable it defines to *table*.

    The layout line supplies a variable name and inclusive start/end column
    positions; these are converted to 0-based columns on the Variable.

    Args:
        table: the Table to which the parsed Variable is added.
        line: one line of an IPUMS layout file.

    Raises:
        RuntimeError: if the line does not match ipums_layout_re.
    """
    m = ipums_layout_re.search(line)
    try:
        # fix: extract the groups once; the original extracted them twice
        # and left the first tuple (variable_name, ...) unused.
        (name, start, end) = m.group(1, 2, 3)
    except AttributeError:
        # m is None when the regex did not match.
        raise RuntimeError("Cannot parse: " + line)
    v = Variable(name=name, vtype=TYPE_VARCHAR)
    # Layout positions are 1-based; Variable columns are 0-based.
    v.set_column(int(start) - 1, int(end) - 1)
    table.add_variable(v)
def mutate(self, info, action="PUT", options=None):
    """Apply a mutation to the Variable store and echo the result.

    "PUT" stores *options* as a Variable record; "DELETE" removes the
    record whose id is options["id"].  Any other action raises.

    Returns:
        VariableMutation wrapping a Variable built from *options*.

    Raises:
        Exception: when *action* is neither "PUT" nor "DELETE".
    """
    store = StoreProxy.store()
    if action == "DELETE":
        store.Variable.delete(options["id"])
    elif action == "PUT":
        store.Variable.put(options)
    else:
        raise Exception("Unsupported mutation action")
    return VariableMutation(variable=Variable(**options))
def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
    """Read a single table from a txt file.

    Scans the file for a title (TXT_TITLE_RE), a version (TXT_VERSION_RE)
    and variable definitions (VARIABLE_RE).  If more than self.MAXLINES
    lines pass without producing any variables, the file is rejected.

    NOTE(review): *prefix* is accepted but never used in this body —
    confirm whether it was meant to filter table names.

    Returns:
        None when the file is not a Census text specification or yields
        no variables; otherwise the parsed table is registered via
        self.add_table() (no explicit return value).
    """
    table = None
    for (ll, line) in enumerate(dopen(filename), 1):
        if ll > self.MAXLINES:
            if table is None or len(table.vars()) == 0:
                logging.info(
                    "{} is not a Census text specification".format(filename))
                return None
        # Get a table name if we do not have one
        if not table:
            m = TXT_TITLE_RE.search(line)
            if m:
                table = Table(name=m.group(1))
                table.add_comment("Parsed from {}".format(filename))
            continue
        # Get the table version if we do not have one
        if table and not table.version:
            m = TXT_VERSION_RE.search(line)
            if m:
                table.version = m.group(1)
                continue
        # Is this a variable name within the table?
        m = VARIABLE_RE.search(line)
        if m:
            (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
            # De-duplicate variable names by appending a counter.
            oname = name
            count = 2
            while name in [existing.name for existing in table.vars()]:
                name = "{}{}".format(oname, count)
                count += 1
            # fix: the original built the Variable from an undefined 'row'
            # (NameError) and built it BEFORE the de-dup rename, so the
            # renamed name was never applied.
            v = Variable(position=position, name=name, desc=desc, vtype=vtype)
            # A "start-end" column spec becomes a [start, end] pair.
            if "-" in column:
                v.column = [int(x) for x in column.split("-")]
            table.add_variable(v)
    # fix: guard against table never being created (len() on None crashed).
    if table is None or len(table.vars()) == 0:
        return None
    self.add_table(table)
class ControlsHandler(CSVFile):
    """CSV-backed handler describing edit-control rows.

    Each CSV row carries a control number, title, expression, level and
    main variable, in the order given by ``columns`` below.
    """
    # First row of the file is a header; skip it when reading data.
    skip_header = True
    has_header = True
    class_csv_guess = True
    # Datatype (and mandatory/title metadata) for each named column.
    # NOTE(review): Unicode/Expression/ControlLevel/Variable and MSG are
    # project-defined datatypes — confirm their exact semantics elsewhere.
    schema = {
        'number': Unicode(mandatory=True, title=MSG(u"Number")),
        'title': Unicode(mandatory=True, title=MSG(u"Title")),
        'expression': Expression(mandatory=True, title=MSG(u"Expression")),
        'level': ControlLevel(mandatory=True, title=MSG(u"Level")),
        'variable': Variable(mandatory=True, title=MSG(u"Main Variable"))
    }
    # Physical column order in the CSV file.
    columns = ['number', 'title', 'expression', 'level', 'variable']
def load_table_from_docx_table(docx_table):
    """Build and return a Table from a docx table (list of rows of cells).

    Row 0, cell 0 supplies the table name (spaces and dots become
    underscores).  Rows recognized by CensusSpec.is_variable_start() define
    a Variable from their first four cells; other non-blank rows are
    treated as extra allowable-value descriptions for the current variable.
    """
    tableName = docx_table[0][0].replace(" ", "_").replace(".", "_")
    if CensusSpec.debug:
        print("DEBUG: Reading table '{}'".format(tableName))
    table = Table(name=tableName)
    v = None  # current variable
    for row in docx_table[1:]:  # don't process the first line
        # change all newlines to spaces
        cols = [x.replace("\n", " ").strip() for x in row]
        if sum([len(x) for x in cols]) == 0:  # blank row
            continue
        if CensusSpec.debug:
            print("DEBUG: cols: {}".format(cols))
        if CensusSpec.is_variable_start(cols):
            if CensusSpec.debug:
                print("DEBUG: defining variable {}".format(cols[0]))
            v = Variable(position=cols[0], name=cols[1],
                         desc=cols[2], vtype=cols[3])
            # if there is a fifth column, it may be allowable values
            if len(cols) == 5:
                v.add_valid_data_description(cols[4])
            table.add_variable(v)
            continue
        # If defining a variable and we have extra cols, they may hold an
        # allowable value in cols[2] or cols[4]
        if v:
            if len(cols) > 2 and len(cols[2]):
                v.add_valid_data_description(cols[2])
                continue  # fix: was the typo 'contine' — a NameError
                          # whenever this branch executed
            if len(cols) > 4 and len(cols[4]):
                v.add_valid_data_description(cols[4])
                continue
        print("Not sure what to do with this:", cols)
        # NOTE(review): assert is stripped under -O; kept to preserve the
        # AssertionError callers may rely on — consider raising ValueError.
        assert False
    return table
def process_file(fname, dbfile):
    """Transfer a SAS dataset into a SQLite database table.

    Infers a schema from the first record of *fname* (table name taken
    from the file's basename, column types mapped from numpy dtypes),
    creates the table in *dbfile*, then streams all rows across in
    CHUNKSIZE batches, logging progress every 10,000 rows.

    Args:
        fname: path of the SAS file readable by pandas.read_sas().
        dbfile: path of the SQLite database to create the table in.
    """
    data = pandas.read_sas(fname, chunksize=1)
    frame = next(data)

    # Make a table whose name is the SAS file's basename without extension.
    table = Table(name=os.path.splitext(os.path.split(fname)[1])[0])
    logging.info("Creating table {}".format(table.name))
    for col in frame.columns:
        v = Variable()
        v.set_name(col)
        v.set_vtype(schema.vtype_for_numpy_type(type(frame[col][0])))
        table.add_variable(v)

    conn = sqlite3.connect(dbfile)
    # fix: the connection was never committed or closed in the original,
    # leaking the handle and risking lost writes on some code paths.
    try:
        c = conn.cursor()
        cmd = table.sql_schema()
        c.execute(cmd)
        t0 = time.time()
        logging.info("Transferring data...")
        istmt = table.sql_insert()
        print(istmt)
        lines = 0
        for frame in pandas.read_sas(fname, chunksize=CHUNKSIZE):
            # Explicit transaction per chunk keeps inserts fast.
            c.execute("BEGIN TRANSACTION;")
            for row in frame.itertuples(index=False):
                c.execute(istmt, row)
                lines += 1
                if lines % 10000 == 0:
                    t = int(time.time() - t0)
                    s = t % 60
                    m = (t % 3600) // 60
                    h = t // 3600
                    logging.info("time: {}:{:02}:{:02} lines {:,}".format(
                        h, m, s, lines))
            c.execute("END TRANSACTION;")
        conn.commit()
    finally:
        conn.close()
def resolve_variables(self, info, entityid, offset=None, limit=None):
    """Return all Variables whose parent entity is *entityid*.

    NOTE(review): *offset* and *limit* are accepted but not applied here —
    confirm whether pagination should be pushed into the store query.
    """
    records = StoreProxy.store().Variable.findByParent(parentid=entityid)
    variables = []
    for record in records:
        variables.append(Variable(**record))
    return variables
def resolve_variable(self, *args, **kwargs):
    """Look up one Variable by the required ``id`` keyword argument."""
    record = Query.store().Variable.findById(id=kwargs["id"])
    return Variable(**record)