def process_file(fname, schemafile):
    table = None
    for line in open(fname):
        # Get a table name if we do not have one
        if not table:
            m = title_re.search(line)
            if m:
                table = Table(name=m.group(1))
        # Get the table version if we do not have one
        if table and not table.version:
            m = version_re.search(line)
            if m:
                table.version = m.group(1)
        # Is this a variable definition within the table?
        m = variable_re.search(line)
        if m:
            v = Variable()
            (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
            oname = name
            count = 2
            while name in [v.name for v in table.vars]:
                name = "{}{}".format(oname, count)
                count += 1
            v.define_from_row([position, name, desc, vtype])
            if "-" in column:
                v.column = [int(x) for x in column.split("-")]
            table.add_variable(v)

    schemafile.write('#define SPEC_FILE "{}"\n'.format(fname))
    table.write_sql_scanner(schemafile)
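
A possible way to call this helper (a sketch only: the spec and header file names are invented, and the function assumes the module-level regexes title_re, version_re and variable_re plus the Table and Variable classes are already defined):

# Hypothetical invocation; "sf1_spec.txt" and "sf1_schema.h" are made-up names.
with open("sf1_schema.h", "w") as schemafile:
    process_file("sf1_spec.txt", schemafile)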
Example #2
 def load_table_schema_from_census_xlsx_file(self, *, filename, prefix=""):
     """Read multiple tables from a xlsx file. Only ingest the tables that have the prefix. We only look at the first 5 columns"""
     from openpyxl import load_workbook
     from openpyxl.cell.read_only import EmptyCell
     wb = load_workbook(filename=filename, read_only=True)
     for ws in wb.worksheets:
         print("Processing worksheet {}".format(ws.title))
         if not self.verify_census_worksheet(ws):
             logging.error(
                 "This does not appear to be a US Census Bureau XLSX specification file"
             )
             raise RuntimeError(
                 "This does not appear to be a US Census Bureau XLSX specification file"
             )
         column = 0
         table = Table(name=ws.title)
         self.add_table(table)
         v = None  # most recently defined Variable; legal values attach to it
         header_skipped = False
         for row in ws.iter_rows():
             # Skip empty rows
             if type(row[0]) == EmptyCell:
                 continue
             # skip the header
             if not header_skipped:
                 header_skipped = True
                 continue
             # Only the first five columns are considered
             values = []
             for i in range(0, 5):
                 if type(row[i]) == EmptyCell:
                     values.append("")
                 elif type(row[i].value) == str:
                     values.append(row[i].value.strip())
                 else:
                     values.append(row[i].value)
             (column_id, column_name, oracle_datatype, desc,
              legal_values) = values
             if (not column_id) and not (legal_values):
                 # Ignore the blank line
                 continue
             if column_id:
                 if type(column_id) != int:
                     logging.info("Column ID is invalid; skipping")
                     continue
                 v = Variable(position=column_id,
                              name=column_name,
                              desc=desc,
                              vtype=oracle_datatype,
                              column=column)
                 # advance to the next column
                 column += v.width
                 table.add_variable(v)
             if v and legal_values:
                 for possible_legal_value in legal_values.split("\n"):
                     r = Range.extract_range_and_desc(possible_legal_value)
                     if r:
                         v.add_range(r)
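
A sketch of how this loader might be invoked. The owning class is not shown in this snippet (CensusSpec appears in Example #7 and is only a guess here, as is its no-argument constructor), and the workbook name is invented:

# Hypothetical usage; the class name and the xlsx path are assumptions.
spec = CensusSpec()
spec.load_table_schema_from_census_xlsx_file(filename="2020_census_spec.xlsx")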
Example #3
 def process_layout_line(*, table, line):
     m = ipums_layout_re.search(line)
     try:
         (name, start, end) = m.group(1, 2, 3)
     except AttributeError:
         # m is None when ipums_layout_re did not match the line
         raise RuntimeError("Cannot parse: " + line)
     v = Variable(name=name, vtype=TYPE_VARCHAR)
     v.set_column(int(start) - 1, int(end) - 1)
     table.add_variable(v)
Example #4
 def mutate(self, info, action="PUT", options=None):
     if action == "PUT":
         StoreProxy.store().Variable.put(options)
     elif action == "DELETE":
         StoreProxy.store().Variable.delete(options["id"])
     else:
         raise Exception("Unsupported mutation action")
     variable = Variable(**options)
     return VariableMutation(variable=variable)
Example #5
    def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
        """Read a single table from a txt file."""
        table = None
        for (ll, line) in enumerate(dopen(filename), 1):
            if ll > self.MAXLINES:
                if (table is None) or len(table.vars()) == 0:
                    logging.info(
                        "{} is not a Census text specification".format(
                            filename))
                    return None

            # Get a table name if we do not have one
            if not table:
                m = TXT_TITLE_RE.search(line)
                if m:
                    table = Table(name=m.group(1))
                    table.add_comment("Parsed from {}".format(filename))
                    continue
            # Get the table version if we do not have one
            if table and not table.version:
                m = TXT_VERSION_RE.search(line)
                if m:
                    table.version = m.group(1)
                    continue
            # Is this a variable name within the table?
            m = VARIABLE_RE.search(line)
            if m:
                (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
                # If the name is already taken, append a counter to make it unique
                oname = name
                count = 2
                while name in [v.name for v in table.vars()]:
                    name = "{}{}".format(oname, count)
                    count += 1
                v = Variable(position=position,
                             name=name,
                             desc=desc,
                             vtype=vtype)
                if "-" in column:
                    v.column = [int(x) for x in column.split("-")]
                table.add_variable(v)
        if table is None or len(table.vars()) == 0:
            return None
        self.add_table(table)
Example #6
class ControlsHandler(CSVFile):

    skip_header = True
    has_header = True
    class_csv_guess = True

    schema = {
        'number': Unicode(mandatory=True, title=MSG(u"Number")),
        'title': Unicode(mandatory=True, title=MSG(u"Title")),
        'expression': Expression(mandatory=True, title=MSG(u"Expression")),
        'level': ControlLevel(mandatory=True, title=MSG(u"Level")),
        'variable': Variable(mandatory=True, title=MSG(u"Main Variable"))
    }

    columns = ['number', 'title', 'expression', 'level', 'variable']
Example #7
    def load_table_from_docx_table(docx_table):
        """Returns a table name and the list of the variables"""
        tableName = docx_table[0][0].replace(" ", "_").replace(".", "_")
        if CensusSpec.debug:
            print("DEBUG: Reading table '{}'".format(tableName))
        table = Table(name=tableName)
        v = None  # current variable
        for row in docx_table[1:]:  # don't process the first line
            cols = [x.replace("\n", " ").strip()
                    for x in row]  # change all newlines to spaces
            if sum([len(x) for x in cols]) == 0:  # blank row
                continue
            if CensusSpec.debug:
                print("DEBUG:  cols: {}".format(cols))
            if CensusSpec.is_variable_start(cols):
                if CensusSpec.debug:
                    print("DEBUG:    defining variable {}".format(cols[0]))
                v = Variable(position=cols[0],
                             name=cols[1],
                             desc=cols[2],
                             vtype=cols[3])

                # if there is a fifth column, it may be allowable values
                if len(cols) == 5:
                    v.add_valid_data_description(cols[4])
                table.add_variable(v)
                continue

            # If we are defining a variable and the row has extra columns, they may
            # carry an allowable value in cols[2] or cols[4]
            if v:
                if len(cols) > 2 and len(cols[2]):
                    v.add_valid_data_description(cols[2])
                    continue
                if len(cols) > 4 and len(cols[4]):
                    v.add_valid_data_description(cols[4])
                    continue
                print("Not sure what to do with this:", cols)
                assert False
        return table
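
A sketch of the input this function appears to expect: it indexes docx_table like a list of rows of plain strings (docx_table[0][0] supplies the table name), so a pre-extracted list of lists such as the hypothetical one below should be acceptable. Whether the second row is recognized as a variable depends on CensusSpec.is_variable_start, which is not shown here, and the call form assumes the function lives on CensusSpec and is usable as a static method.

# Hypothetical rows; every value is invented for illustration.
rows = [
    ["P1. TOTAL POPULATION", "", "", ""],
    ["1", "FILEID", "File Identification", "A6"],
]
table = CensusSpec.load_table_from_docx_table(rows)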
Example #8
def process_file(fname, dbfile):

    data = pandas.read_sas(fname, chunksize=1)
    frame = next(data)

    # Make a table
    table = Table(name=os.path.splitext(os.path.split(fname)[1])[0])
    logging.info("Creating table {}".format(table.name))
    for col in frame.columns:
        v = Variable()
        v.set_name(col)
        v.set_vtype(schema.vtype_for_numpy_type(type(frame[col][0])))
        table.add_variable(v)

    conn = sqlite3.connect(dbfile)
    c = conn.cursor()
    cmd = table.sql_schema()
    c.execute(cmd)

    t0 = time.time()
    logging.info("Transferring data...")
    istmt = table.sql_insert()
    print(istmt)
    lines = 0
    for frame in pandas.read_sas(fname, chunksize=CHUNKSIZE):
        c.execute("BEGIN TRANSACTION;")
        for row in frame.itertuples(index=False):
            c.execute(istmt, row)
            lines += 1
            if lines % 10000 == 0:
                t = int(time.time() - t0)
                s = t % 60
                m = (t % 3600) // 60
                h = t // 3600
                logging.info("time: {}:{:02}:{:02} lines {:,}".format(
                    h, m, s, lines))
        c.execute("END TRANSACTION;")
Example #9
 def resolve_variables(self, info, entityid, offset=None, limit=None):
     return [Variable(**v) for v in StoreProxy.store().Variable.findByParent(parentid=entityid)]
Example #10
 def resolve_variable(self, *args, **kwargs):
     return Variable(**Query.store().Variable.findById(id=kwargs["id"]))