예제 #1
0
def test_refactor_dataframes():
    df = pd.DataFrame([{
        'name': 'Terry',
        'score': 0.5,
    }, {
        'name': 'Terry',
        'score': 0.8,
    }, {
        'name': 'Owen',
        'score': 0.7,
    }])
    conn = sqlite3.connect(':memory:')
    output = utils.refactor_dataframes(conn, [df],
                                       {'name': ('People', 'first_name')})
    assert 1 == len(output)
    dataframe = output[0]
    # There should be a 'People' table in sqlite
    assert [
        (1, 'Terry'),
        (2, 'Owen'),
    ] == conn.execute('select id, first_name from People').fetchall()
    assert ('   name  score\n'
            '0     1    0.5\n'
            '1     1    0.8\n'
            '2     2    0.7') == str(dataframe)
예제 #2
0
def test_refactor_dataframes():
    df = pd.DataFrame([
        {
            "name": "Terry",
            "score": 0.5
        },
        {
            "name": "Terry",
            "score": 0.8
        },
        {
            "name": "Owen",
            "score": 0.7
        },
    ])
    conn = sqlite3.connect(":memory:")
    output = utils.refactor_dataframes(conn, [df],
                                       {"name":
                                        ("People", "first_name")}, False)
    assert 1 == len(output)
    dataframe = output[0]
    # There should be a 'People' table in sqlite
    assert [(1, "Terry"), (2, "Owen")
            ] == conn.execute("select id, first_name from People").fetchall()
    assert ("   name  score\n"
            "0     1    0.5\n"
            "1     1    0.8\n"
            "2     2    0.7") == str(dataframe)
예제 #3
0
def convert_csv_to_sqlite(csv_filename):
    sqlite_filename = "%s.sqlite" % os.path.splitext(csv_filename)[0]
    conn = sqlite3.connect(sqlite_filename)

    dataframes = []
    csvs = {os.path.splitext(csv_filename)[0]: csv_filename}

    for name, path in csvs.items():
        try:
            df = load_csv(path,
                          separator=",",
                          skip_errors=False,
                          quoting=0,
                          shape=None)
            df.table_name = "Items"
            dataframes.append(df)
        except LoadCsvError as e:
            print("Could not load {}: {}".format(path, e), file=sys.stderr)

    print("Loaded {} dataframes".format(len(dataframes)))
    assert len(dataframes) == 1

    # Use extract_columns to build a column:(table,label) dictionary
    foreign_keys = {}

    column_map = [
        (re.compile(r"Other Title(?:|_\d+)"), "Title"),
        (re.compile(r"Creator/Publisher(?:|_\d+)"), "Creator"),
        (re.compile(r"Subject(?:|_\d+)"), "Subject"),
        (re.compile(r"Collection(?:|_\d+)"), "Collection"),
        (re.compile(r"Era(?:|_\d+)"), "Era"),
        (re.compile(r"Location(?:|_\d+)"), "Location"),
        (re.compile(r"Geography(?:|_\d+)"), "Geography"),
    ]

    for column_name in dataframes[0].columns:
        for regex, mapped in column_map:
            if regex.search(column_name):
                foreign_keys[column_name] = (mapped, "value")

    # Now we have loaded the dataframes, we can refactor them
    refactored = refactor_dataframes(conn, dataframes, foreign_keys, False)

    for df in refactored:
        # This is a bit trickier because we need to
        # create the table with extra SQL for foreign keys
        if table_exists(conn, df.table_name):
            drop_table(conn, df.table_name)

        to_sql_with_foreign_keys(conn, df, df.table_name, foreign_keys)

    conn.close()

    print("Updated", sqlite_filename)
예제 #4
0
def test_refactor_dataframes():
    df = pd.DataFrame([{
        'name': 'Terry',
        'score': 0.5,
    }, {
        'name': 'Terry',
        'score': 0.8,
    }, {
        'name': 'Owen',
        'score': 0.7,
    }])
    output = utils.refactor_dataframes([df],
                                       {'name': ('People', 'first_name')})
    assert 2 == len(output)
    lookup_table, dataframe = output
    assert {1: 'Terry', 2: 'Owen'} == lookup_table.id_to_value
    assert ('   name  score\n'
            '0     1    0.5\n'
            '1     1    0.8\n'
            '2     2    0.7') == str(dataframe)