Example #1
File: run.py Project: tswsxk/XKT
def train(train_fn,
          test_fn,
          reporthook=None,
          final_reporthook=None,
          **cfg_kwargs):  # pragma: no cover
    from longling.ML.toolkit.hyper_search import prepare_hyper_search

    cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
        cfg_kwargs,
        Configuration,
        reporthook,
        final_reporthook,
        primary_key="macro_avg:f1")

    _cfg = Configuration(**cfg_kwargs)
    print(_cfg)
    _net = get_net(**_cfg.hyper_params)

    train_data = etl(_cfg.var2val(train_fn), params=_cfg)
    test_data = etl(_cfg.var2val(test_fn), params=_cfg)

    numerical_check(_net,
                    _cfg,
                    train_data,
                    test_data,
                    dump_result=not tag,
                    reporthook=reporthook,
                    final_reporthook=final_reporthook)
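A minimal sketch of how this entry point might be invoked. The paths and the extra keyword argument are illustrative placeholders, not taken from the project; what is grounded in the code above is that extra keyword arguments pass through prepare_hyper_search and then into Configuration:

# Hypothetical call site; file names and kwargs are placeholders.
train(
    "train.json",          # train_fn, resolved through _cfg.var2val()
    "test.json",           # test_fn
    workspace="XKT_demo",  # assumed Configuration field, shown for illustration only
)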
Example #2
def train(train_fn, test_fn, vec_files, reporthook=None, final_reporthook=None, **cfg_kwargs):  # pragma: no cover
    from longling import print_time
    from longling.ML.toolkit.hyper_search import prepare_hyper_search

    cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
        cfg_kwargs, Configuration, reporthook, final_reporthook, final_key="prf:avg:f1"
    )

    _cfg = Configuration(**cfg_kwargs)
    _cfg.logger.info(_cfg)
    vec_files = parse_vec_files(vec_files)

    with print_time(tips='loading embedding', logger=_cfg.logger):
        embeddings = load_embedding({k: _cfg.var2val(v) for k, v in vec_files.items()}, _cfg.logger)

    train_data = etl(_cfg.var2val(train_fn), embeddings, params=_cfg)
    test_data = etl(_cfg.var2val(test_fn), embeddings, params=_cfg)

    embedding_size = get_embedding_size(embeddings)

    _net = get_net(embedding_size=embedding_size, **_cfg.hyper_params)
    _net.initialize(ctx=_cfg.ctx)
    _net.embedding.set_weight(get_embedding_array(embeddings))

    numerical_check(_net, _cfg, train_data, test_data, dump_result=not tag, reporthook=reporthook,
                    final_reporthook=final_reporthook)
Example #3
def train(train_fn, test_fn, **cfg_kwargs):
    _cfg = Configuration(**cfg_kwargs)
    _net = get_net(**_cfg.hyper_params)

    train_data = etl(_cfg.var2val(train_fn), params=_cfg)
    test_data = etl(_cfg.var2val(test_fn), params=_cfg)

    numerical_check(_net, _cfg, train_data, test_data, dump_result=True)
Example #4
    def test_etl_wwc(cls):
        extract_date = datetime.strptime('2019-03-02',
                                         transform.EXTRACT_DATE_FORMAT).date()
        data_dir = os.path.join(TEST_DIR, etl.DATA_DIR)
        etl.etl('wwc', extract_date, data_dir=data_dir, db=TEST_DB)

        conn = sqlite3.connect(TEST_DB)
        c = conn.cursor()
        c.execute('SELECT count(*) FROM wwc_accounts')
        assert c.fetchone()[0] == 5
Example #5
def main():
    # delete_rds_instance()

    if DB_ON_CLOUD:
        setup_rds_instance()

    create_db()
    create_schema()

    etl()
Example #6
def train(train_fn,
          test_fn,
          reporthook=None,
          final_reporthook=None,
          primary_key="macro_auc",
          params_save=False,
          **cfg_kwargs):  # pragma: no cover
    from longling.ML.toolkit.hyper_search import prepare_hyper_search

    cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
        cfg_kwargs,
        Configuration,
        reporthook,
        final_reporthook,
        primary_key=primary_key)

    _cfg = Configuration(**cfg_kwargs)
    print(_cfg)

    train_data, train_df = etl(_cfg.var2val(train_fn), params=_cfg)
    test_data, _ = etl(_cfg.var2val(test_fn), params=_cfg)

    _net = get_net(**_cfg.hyper_params)
    net_init(_net,
             cfg=_cfg,
             **_cfg.init_params,
             int_df=train_df,
             user_num=_cfg.hyper_params["user_num"],
             item_num=_cfg.hyper_params["item_num"],
             logger=_cfg.logger)

    numerical_check(_net,
                    _cfg,
                    train_data,
                    test_data,
                    dump_result=not tag,
                    reporthook=reporthook,
                    final_reporthook=final_reporthook,
                    params_save=params_save)
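A hedged usage sketch for this variant: primary_key is handed to prepare_hyper_search, presumably as the metric used to compare runs, and params_save is forwarded to numerical_check to keep the trained parameters. The file paths are placeholders:

# Hypothetical invocation; paths are placeholders.
train(
    "train.csv",
    "test.csv",
    primary_key="macro_auc",  # metric used to rank runs (the default shown above)
    params_save=True,         # persist trained parameters after training
)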
Example #7
def main():
    # Create new DB and connect to it
    conn = sqlite3.connect("Transaction_DB.db")
    select_cursor = conn.cursor()
    insert_cursor = conn.cursor()

    # Load transactions from csv file to SQLite DB
    transaction_table_name = 'transaction_raw_data'
    transaction_column_types = [
        'text', 'date', 'text', 'int', 'text', 'text', 'text', 'text'
    ]
    transaction_path = 'anon_transactions_sample.csv'
    load_csv_sqlite(transaction_path, transaction_table_name,
                    transaction_column_types, insert_cursor)

    # Load category csv file to SQLite DB
    category_table_name = 'category_raw_data'
    category_column_types = ['text', 'text', 'text', 'text', 'text', 'text']
    category_path = 'categories.csv'
    load_csv_sqlite(category_path, category_table_name, category_column_types,
                    insert_cursor)

    # Create the data warehouse table structure
    create_dw(insert_cursor)

    # Load transaction data into the data warehouse
    transaction_etl(select_cursor, insert_cursor)

    # Load category data into the data warehouse
    sector_extract = ''' SELECT DISTINCT sector_id, sector_name FROM category_raw_data '''
    sector_load = ''' INSERT OR IGNORE INTO sector (sector_id, sector_name) VALUES (?, ?) '''
    etl(select_cursor, insert_cursor, sector_extract, sector_load)

    category_extract = ''' SELECT DISTINCT category_id, category_name, sector_id FROM category_raw_data'''
    category_load = ''' INSERT OR IGNORE INTO category (category_id, category_name, sector_id) VALUES (?, ?, ?) '''
    etl(select_cursor, insert_cursor, category_extract, category_load)

    subcategory_extract = ''' SELECT DISTINCT subcategory_id, subcategory_name, category_id FROM category_raw_data '''
    subcategory_load = ''' INSERT OR IGNORE INTO subcategory (subcategory_id, subcategory_name, category_id) VALUES (?, ?, ?) '''
    etl(select_cursor, insert_cursor, subcategory_extract, subcategory_load)

    conn.commit()
    conn.close()
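The three generic etl(...) calls above imply a small helper that pipes rows from an extract query into a parameterised INSERT. A minimal sketch of such a helper, inferred from the call sites; the project's actual implementation may differ:

def etl(select_cursor, insert_cursor, extract_sql, load_sql):
    # Run the extract query on one cursor and feed every resulting row
    # into the parameterised load statement on the other cursor.
    select_cursor.execute(extract_sql)
    insert_cursor.executemany(load_sql, select_cursor.fetchall())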
Example #8
from etl import etl
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

points = etl()                 # the ETL step returns the feature matrix
kmeans = KMeans(n_clusters=2)  # partition the points into two clusters
kmeans.fit(points)
labels = kmeans.labels_

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for item, label in zip(points, labels):
    if label == 0:
        ax.scatter(item[0], item[1], item[2], c='r', marker='o')
    else:
        ax.scatter(item[0], item[1], item[2], c='b', marker='s')

ax.set_xlabel('changeOverTime')
ax.set_ylabel('changePercent')
ax.set_zlabel('volume')
plt.show()
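The point-by-point loop above issues one scatter call per sample, which gets slow for large datasets. An equivalent vectorized version plots each cluster in a single call; a sketch, assuming points is array-like (which KMeans.fit already requires):

import numpy as np

pts = np.asarray(points)
mask = labels == 0  # boolean mask for the first cluster
ax.scatter(pts[mask, 0], pts[mask, 1], pts[mask, 2], c='r', marker='o')
ax.scatter(pts[~mask, 0], pts[~mask, 1], pts[~mask, 2], c='b', marker='s')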
Example #9
""" Main Module"""
from config_module import config
from create_tables import main as create_tables
from etl import main as etl
from load_csv_files import main as load_csv_file
from question import main as question
from sql_queries.oltp import create_table_queries, drop_table_queries
"""
 3 thins done in this project:
 1- Creating original database and tables
 2- Loading csv files data to original table
 3- Creating new database and tables which is a fact and multiple dimension tables through sql
 queries for transformation in etl process. Queries can be check in sql_queries.oltp module
"""

if __name__ == '__main__':
    obj = config.resolve()
    create_tables(obj, "OLTP_DB", create_table_queries, drop_table_queries)
    load_csv_file()
    etl()
    question()
Example #10
def updateData():
    """Function to update dataset."""
    print("[INFO] Updating data")
    data = etl()
    data.to_csv('data.csv', index=False)
    print(f"[INFO] Data has been updated on {datetime.now}.")