Example #1
import ticdat
from ticdat.utils import verify

# Rename any field that collides with a reserved keyword by prefixing '_'
# (or strip that prefix again when undo=True), and return a new
# TicDatFactory with default values and data types carried over.
def change_fields_with_reserved_keywords(tdf, reserved_keywords, undo=False):
    tdf_schema = tdf.schema()
    mapping = {}
    for table, fields in tdf_schema.items():
        for fields_list in [fields[0], fields[1]]:
            for findex in range(len(fields_list)):
                original_field = fields_list[findex]
                if not undo:
                    verify(
                        not fields_list[findex].startswith('_'),
                        ("Field names cannot start with '_', in table %s : " +
                         "field is %s") % (table, fields_list[findex]))
                    if fields_list[findex].lower() in reserved_keywords:
                        fields_list[findex] = '_' + fields_list[findex]
                else:
                    if fields_list[findex].startswith('_'):
                        fields_list[findex] = fields_list[findex][1:]
                mapping[table, original_field] = fields_list[findex]
    rtn = ticdat.TicDatFactory(**tdf_schema)
    for (table, original_field), new_field in mapping.items():
        if original_field in tdf.default_values.get(table, ()):
            rtn.set_default_value(table, new_field,
                                  tdf.default_values[table][original_field])
        if original_field in tdf.data_types.get(table, ()):
            rtn.set_data_type(table, new_field,
                              *(tdf.data_types[table][original_field]))
    if hasattr(tdf, 'opl_prepend'):
        rtn.opl_prepend = tdf.opl_prepend
    if hasattr(tdf, 'ampl_prepend'):
        rtn.ampl_prepend = tdf.ampl_prepend
    if hasattr(tdf, 'lingo_prepend'):
        rtn.lingo_prepend = tdf.lingo_prepend
    return rtn
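
A minimal usage sketch (hypothetical schema and keyword set, assuming ticdat is installed): the data field "max" collides with a reserved keyword and gets the '_' prefix, while the primary key "key" is left alone.

tdf = ticdat.TicDatFactory(parameters=[["key"], ["max"]])
fixed_tdf = change_fields_with_reserved_keywords(tdf, {"max", "min"})
assert fixed_tdf.schema() == {"parameters": [["key"], ["_max"]]}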
Example #2
from collections import defaultdict
import ticdat
from ticdat.utils import dictish, stringish, containerish

# Given a {table_name: container_of_rows} dict, return a
# {table_name: {primary_key_value: row_count}} dict describing the
# duplicated primary keys (tables without duplicates are omitted).
def find_duplicates_from_dict_ticdat(tdf, dict_ticdat):
    assert isinstance(tdf, ticdat.TicDatFactory)
    assert dictish(dict_ticdat) and all(map(stringish, dict_ticdat)) and \
           all(map(containerish, dict_ticdat.values()))
    primary_key_fields = {k: v for k, v in tdf.primary_key_fields.items() if v}
    if primary_key_fields:
        old_schema = {k: v for k, v in tdf.schema().items()
                      if k in primary_key_fields}
        all_data_tdf = ticdat.TicDatFactory(
            **{t: [[], pks + dfs] for t, (pks, dfs) in old_schema.items()})
        td = all_data_tdf.TicDat(
            **{k: v for k, v in dict_ticdat.items()
               if k in primary_key_fields})
        rtn = {t: defaultdict(int) for t in primary_key_fields}
        for t, flds in list(primary_key_fields.items()):
            tbl = getattr(td, t)
            for row in tbl:
                k = tuple(row[f] for f in flds)
                k = k[0] if len(k) == 1 else k
                rtn[t][k] += 1
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del rtn[t]
        return rtn
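
A usage sketch with made-up data: the "categories" rows below list the "food" key twice, so it is reported with a count of 2.

tdf = ticdat.TicDatFactory(categories=[["name"], ["max_nutrition"]])
dups = find_duplicates_from_dict_ticdat(
    tdf, {"categories": [["food", 10], ["drink", 20], ["food", 30]]})
assert dups == {"categories": {"food": 2}}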
Example #3
import ticdat

# Return a (TicDat, TicDatFactory) pair equivalent to (td, tdf) but with
# every generic table replaced by a concrete, all-data-fields table.
def create_generic_free(td, tdf):
    assert tdf.good_tic_dat_object(td)
    if not tdf.generic_tables:
        return td, tdf
    sch = {k: v for k, v in tdf.schema().items()
           if k not in tdf.generic_tables}
    for t in tdf.generic_tables:
        if len(getattr(td, t)):
            sch[t] = [[], list(getattr(td, t).columns)]
    rtn_tdf = ticdat.TicDatFactory(**sch)
    return rtn_tdf.TicDat(**{t: getattr(td, t)
                             for t in rtn_tdf.all_tables}), rtn_tdf
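
A rough usage sketch, assuming ticdat's convention of declaring a generic table with "*" (pandas required): the generic "misc" table is replaced by a concrete table whose data fields are the DataFrame's columns.

import pandas as pd
tdf = ticdat.TicDatFactory(parameters=[["key"], ["value"]], misc="*")
td = tdf.TicDat(parameters=[["p1", 10]],
                misc=pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
concrete_td, concrete_tdf = create_generic_free(td, tdf)
assert not concrete_tdf.generic_tables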
Example #4
    # NOTE: this snippet starts mid-function in the original listing; tl, td,
    # chk1 and chk2 are built by earlier setup code that was not captured.
    tl._forceguout()  # for testing purposes, using underscore is ok
    assert chk1 and chk2
    verify(
        sum(td.childTable[k]["dummy"]
            for k in tl.slice("*", "*", 3)) == chk1 * 2.)
    verify(
        sum(td.childTable[k]["dummy"]
            for k in tl.slice("*", 3, 2)) == chk2 * 2.)
    verify(sum(td.childTable[k]["dummy"] for k in tl.slice("*", 3, 1)) == 0)
    verify(sum(td.childTable[k]["dummy"] for k in tl.slice("*", 3, 3)) == 0)
    assert tl._archived_slicings


# make a simple schema
tdf = ticdat.TicDatFactory(p1=[["id"], []],
                           p2=[["id"], []],
                           childTable=[["p1_1", "p1_2", "p2"], ["dummy"]])
tdf.set_default_value("childTable", "dummy", 2.)

smallTd = tdf.TicDat()
smallChk = populateTd(smallTd, 30, 20)
smallSmartTupleList = tuplelist(smallTd.childTable)
smallDumbTupleList = DumbTupleList(smallTd.childTable)
smallSlicer = ticdat.Slicer(smallTd.childTable)
smallChildDf = tdf.copy_to_pandas(smallTd, ["childTable"]).childTable
checkChildDfLen(smallChildDf, *smallChk)
checkTupleListLen(smallSmartTupleList, *smallChk)
checkTupleListLen(smallDumbTupleList, *smallChk)
checkSlicerLen(smallSlicer, *smallChk)
checkChildDfSum(smallChildDf, *smallChk)
checkTupleListSum(smallSmartTupleList, smallTd, *smallChk)
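
For context, a standalone sketch of the ticdat.Slicer behavior this test exercises ("*" acts as a wildcard in each key position; the keys below are made up):

slicer = ticdat.Slicer([("a", "b", 3), ("a", "c", 3), ("a", "b", 4)])
assert set(slicer.slice("*", "*", 3)) == {("a", "b", 3), ("a", "c", 3)}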
Example #5
import ticdat

# Return a TicDatFactory that recasts each table's primary key fields as
# plain data fields (so duplicate keys can be loaded for inspection), or
# None if no table has primary key fields.
def create_duplicate_focused_tdf(tdf):
    primary_key_fields = {k: v for k, v in tdf.primary_key_fields.items() if v}
    if primary_key_fields:
        return ticdat.TicDatFactory(
            **{k: [[], v] for k, v in primary_key_fields.items()})
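
A usage sketch with a hypothetical schema: the returned factory drops the original data fields and recasts the primary key "name" as a plain data field.

tdf = ticdat.TicDatFactory(categories=[["name"], ["max_nutrition"]])
dup_tdf = create_duplicate_focused_tdf(tdf)
assert dup_tdf.schema() == {"categories": [[], ["name"]]}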
Example #6
from collections import defaultdict
import ticdat
from ticdat.utils import verify, stringish, containerish, dictish

try:
    from pandas import DataFrame
except ImportError:
    DataFrame = None


def find_denormalized_sub_table_failures(table, pk_fields, data_fields):
    """
    checks to see if the table argument contains a denormalized sub-table
    indexed by pk_fields with data fields data_fields
    :param table: The table to study. Can either be a pandas DataFrame or a
                  or a container of consistent {field_name:value} dictionaries.
    :param pk_fields: The pk_fields of the sub-table. Needs to be fields
                      (but not necc primary key fields) of the table.
    :param data_fileds: The data fields of the sub-table. Needs to be fields
                        (but not necc data fields) of the table.
    :return: A dictionary indexed by the pk_fields values in the table
             that are associated with improperly denormalized table rows. The
             values of the return dictionary are themselves dictionaries indexed
             by data fields. The values of the inner dictionary are
             tuples of the different distinct values found for the data field
             at the different rows with common primary key field values.
             The inner dictionaries are pruned so that only tuples of length >1
             are included, and the return dictionary is pruned so that only
             entries with at least one non-pruned inner dictionary is included.
             Thus, a table that has a properly denormalized (pk_fields, data_fields)
             sub-table will return an empty dictionary.
    """
    pk_fields = (pk_fields, ) if stringish(pk_fields) else pk_fields
    data_fields = (data_fields, ) if stringish(data_fields) else data_fields
    verify(
        containerish(pk_fields) and all(map(stringish, pk_fields)),
        "pk_fields needs to be either a field name or a container of field names"
    )
    verify(
        containerish(data_fields) and all(map(stringish, data_fields)),
        "data_fields needs to be either a field name or a container of field names"
    )
    verify(
        len(set(pk_fields).union(data_fields)) == len(pk_fields) +
        len(data_fields),
        "there are duplicate field names amongst pk_fields, data_fields")
    if DataFrame and isinstance(table, DataFrame):
        verify(hasattr(table, "columns"), "table missing columns")
        for f in tuple(pk_fields) + tuple(data_fields):
            verify(f in table.columns, "%s isn't a column for table" % f)
        tdf = ticdat.TicDatFactory(
            t=[[], tuple(pk_fields) + tuple(data_fields)])
        dat = tdf.TicDat(t=table)
        return find_denormalized_sub_table_failures(dat.t, pk_fields,
                                                    data_fields)
    verify(
        containerish(table) and all(map(dictish, table)),
        "table needs to either be a pandas.DataFrame or a container of {field_name:value} dictionaries"
    )
    rtn = defaultdict(lambda: defaultdict(set))
    for row in table:
        for f in tuple(pk_fields) + tuple(data_fields):
            verify(
                f in row,
                "%s isn't a key for one of the inner dictionaries of table" %
                f)
            verify(hasattr(row[f], "__hash__"),
                   "the values for field %s all need to be hashable" % f)
        pk = row[pk_fields[0]] if len(pk_fields) == 1 else tuple(
            row[f] for f in pk_fields)
        for f in data_fields:
            rtn[pk][f].add(row[f])
    for k, v in list(rtn.items()):
        rtn[k] = {f: tuple(s) for f, s in v.items() if len(s) > 1}
        if not rtn[k]:
            del rtn[k]
    return dict(rtn)
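
A usage sketch with made-up rows: the two "a" rows agree on "x" but disagree on "y", so only "y" is flagged for key "a".

rows = [{"pk": "a", "x": 1, "y": 10},
        {"pk": "b", "x": 2, "y": 20},
        {"pk": "a", "x": 1, "y": 30}]
failures = find_denormalized_sub_table_failures(rows, "pk", ("x", "y"))
assert set(failures) == {"a"}
assert set(failures["a"]["y"]) == {10, 30}
assert "x" not in failures["a"]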