Example #1
def get_dataset_md5(dataset,
                    use_cache=False,
                    debug=True,
                    location=temp_file_location):
    """
    Parameters
    ----------
    dataset : dataset script object
    use_cache : True to use cached data or False to download again
    debug: True to raise error or False to fail silently
    location: path where temporary files are to be created for finding md5

    Returns
    -------
    str : The md5 value of a particular dataset.

    Example
    -------
    >>> for dataset in reload_scripts():
    ...     if dataset.name=='aquatic-animal-excretion':
    ...         print(get_dataset_md5(dataset))
    ...
    683c8adfe780607ac31f58926cf1d326
    """
    try:
        db_name = '{}_sqlite.db'.format(dataset.name.replace('-', '_'))
        workdir = mkdtemp(dir=location)
        os.chdir(workdir)
        engine = sqlite_engine.__new__(sqlite_engine.__class__)
        engine.script_table_registry = {}
        args = {
            'command': 'install',
            'dataset': dataset,
            'file': os.path.join(workdir, db_name),
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        engine.opts = args
        engine.use_cache = use_cache
        dataset.download(engine=engine, debug=debug)
        engine.to_csv(sort=False)
        engine.final_cleanup()
        os.remove(os.path.join(workdir, db_name))
        current_md5 = getmd5(os.path.join(file_location, workdir),
                             data_type='dir',
                             encoding=dataset.encoding)
        if not os.path.exists(
                os.path.join(file_location, 'current', dataset.name)):
            os.makedirs(os.path.join(file_location, 'current', dataset.name))
        for file in os.listdir(workdir):
            move(os.path.join(workdir, file),
                 os.path.join(file_location, 'current', dataset.name))
    finally:
        if os.path.isfile(db_name):
            os.remove(db_name)
        if os.path.exists(os.path.join(HOME_DIR, 'raw_data', dataset.name)):
            rmtree(os.path.join(HOME_DIR, 'raw_data', dataset.name))
        os.chdir(os.path.dirname(file_location))
        rmtree(workdir)
    return current_md5
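A note on the sqlite_engine.__new__(sqlite_engine.__class__) line above: it allocates a fresh engine object without running __init__, so every dataset gets its own empty script_table_registry instead of sharing state with the module-level engine. A minimal sketch of the same pattern, using a hypothetical Engine class:

class Engine:
    def __init__(self):
        # setup we want to skip when cloning
        self.script_table_registry = {}

shared_engine = Engine()
# allocate a new instance of the same class without calling __init__ ...
fresh_engine = shared_engine.__new__(shared_engine.__class__)
# ... then set only the state this run needs
fresh_engine.script_table_registry = {}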
Example #2
def test_status_dashboard():
    if not os.path.exists(os.path.join(file_location, 'test_dir')):
        os.makedirs(os.path.join(file_location, 'test_dir'))
        create_dirs(test_files_location)
        os.makedirs(
            os.path.join(file_location, 'test_dir', 'old', 'sample-dataset'))
    os.chdir(os.path.join(test_files_location, 'old', 'sample-dataset'))
    script_module = get_script_module('sample_dataset')
    sqlite_engine.opts = {
        'install': 'sqlite',
        'file': 'test_db.sqlite3',
        'table_name': '{db}_{table}',
        'data_dir': '.'
    }
    sqlite_engine.use_cache = False
    script_module.download(engine=sqlite_engine)
    script_module.engine.final_cleanup()
    script_module.engine.to_csv()
    os.remove('test_db.sqlite3')

    # Finding the md5 of the modified dataset
    setattr(script_module.tables['main'], 'path', modified_dataset_path)
    workdir = mkdtemp(dir=test_files_location)
    os.chdir(workdir)
    sqlite_engine.use_cache = False
    script_module.download(engine=sqlite_engine)
    script_module.engine.final_cleanup()
    script_module.engine.to_csv()
    os.remove('test_db.sqlite3')
    calculated_md5 = getmd5(os.getcwd(), data_type='dir')
    rmtree(workdir)

    # If md5 of current dataset doesn't match with current
    # md5 we have to find the diff
    if calculated_md5 != precalculated_md5:
        os.chdir(os.path.join(test_files_location, 'current'))
        sqlite_engine.opts = {
            'install': 'sqlite',
            'file': 'test_db_new.sqlite3',
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        sqlite_engine.use_cache = False
        script_module.download(sqlite_engine)
        script_module.engine.final_cleanup()
        script_module.engine.to_csv()
        os.remove('test_db_new.sqlite3')
        diff_generator(script_module, location=test_files_location)

    diff_exist = os.path.isfile(
        os.path.join(test_files_location, 'diffs',
                     'sample_dataset_main.html'))
    csv_exist = os.path.isfile(
        os.path.join(test_files_location, 'old', 'sample-dataset',
                     'sample_dataset_main.csv'))
    os.chdir(file_location)
    rmtree(test_files_location)
    assert diff_exist
    assert csv_exist
Example #3
def get_dataset_md5(dataset,
                    use_cache=False,
                    debug=True,
                    location=temp_file_location):
    """Get the md5 value of a particular dataset

    dataset script object
    use_cache, True to use cached data or False to download again
    debug,True to raise error or False to fail silently
    location, path where temporary files are to be created for finding md5"""
    try:
        db_name = '{}_sqlite.db'.format(dataset.name.replace('-', '_'))
        workdir = mkdtemp(dir=location)
        os.chdir(workdir)
        engine = sqlite_engine.__new__(sqlite_engine.__class__)
        engine.script_table_registry = {}
        args = {
            'command': 'install',
            'dataset': dataset,
            'file': os.path.join(workdir, db_name),
            'table_name': '{db}_{table}',
            'data_dir': '.'
        }
        engine.opts = args
        engine.use_cache = use_cache
        dataset.download(engine=engine, debug=debug)
        engine.to_csv(sort=False)
        engine.final_cleanup()
        try:
            os.remove(join_path([workdir, db_name]))
        except OSError as error:
            print("There was an error.", error)
        current_md5 = getmd5(workdir,
                             data_type='dir',
                             encoding=dataset.encoding)

        ds = os.path.join(file_location, 'current', dataset.name)
        try:
            if os.path.exists(ds):
                rmtree(ds)
        except OSError as error:
            print(error)
        os.makedirs(ds)
        for file in os.listdir(workdir):
            try:
                move(os.path.join(workdir, file), ds)
            except OSError as error:
                print(error)
    finally:
        if os.path.isfile(db_name):  # delete database file
            os.remove(db_name)
        delete_raw_data(dataset)  # delete raw data

        os.chdir(os.path.dirname(file_location))
        rmtree(workdir)  # delete temp directory
    return current_md5
Example #4
def test_download_regression(dataset, expected):
    """Test download regression."""
    os.chdir(retriever_root_dir)
    base_path = 'test_raw_data'
    path = os.path.join(base_path, dataset)
    data_dir = "test_temp"
    data = os.path.normpath(os.path.join(retriever_root_dir, path, data_dir))

    rt.download(dataset, path=path)
    current_md5 = getmd5(data=path, data_type='dir')
    assert current_md5 == expected
    shutil.rmtree(base_path)

    # download using path and sub_dir
    os.chdir(retriever_root_dir)
    rt.download(dataset, path=path, sub_dir=data_dir)
    current_md5 = getmd5(data=data, data_type='dir')
    assert current_md5 == expected
    shutil.rmtree(base_path)
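dataset and expected are presumably supplied by pytest parametrization. A hypothetical setup (the name/md5 pair below is borrowed from the doctest in Example #1):

import pytest

download_regression_data = [
    ("aquatic-animal-excretion", "683c8adfe780607ac31f58926cf1d326"),
]

@pytest.mark.parametrize("dataset, expected", download_regression_data)
def test_download_regression(dataset, expected):
    ...  # body as shown above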
Example #5
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    previous_md5 = ""

    try:
        dataset_detail = load_dataset_details()
        previous_detail_records = "dataset_details" in dataset_detail and dataset_detail[
            "dataset_details"]
        dataset_has_record = dataset.name in dataset_detail['dataset_details']
        if previous_detail_records and dataset_has_record:
            previous_md5 = dataset_detail['dataset_details'][
                dataset.name]['md5']

        if dataset_type(dataset) == 'spatial':
            install_postgres(dataset)
            dir_path = DATASET_DATA_FOLDER.format(dataset_name=dataset.name)
            md5 = getmd5(dir_path, data_type='dir')
            if not dataset_has_record or md5 != previous_md5:
                diff = diff_generator_spatial(dataset)
            else:
                remove_old_diff(dataset)
            data_shift(dataset, is_spatial=True)
        else:
            md5 = get_dataset_md5(dataset)
            if not dataset_has_record or md5 != previous_md5:
                diff = diff_generator(dataset)
            else:
                remove_old_diff(dataset)
            data_shift(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff
        }
        json_file_details["last_checked_on"] = datetime.now(
            timezone.utc).strftime("%d %b %Y")
        dataset_details_write = open(DATASET_DETAIL_JSON, 'w')
        json.dump(json_file_details,
                  dataset_details_write,
                  sort_keys=True,
                  indent=4)
        dataset_details_write.close()
        delete_raw_data(dataset)
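For reference, json_file_details as passed to json.dump above has the following shape (the values here are illustrative):

{
    "dataset_details": {
        "aquatic-animal-excretion": {
            "md5": "683c8adfe780607ac31f58926cf1d326",
            "status": True,
            "reason": None,
            "diff": None
        }
    },
    "last_checked_on": "01 Jan 2024"
}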
Example #6
def get_csv_md5(dataset, engine, tmpdir, install_function, config, cols=None):
    workdir = tmpdir.mkdtemp()
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    time.sleep(5)
    engine_obj.to_csv(select_columns=cols)
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
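A hypothetical call site, relying on pytest's tmpdir fixture; the dataset name, config, and expected hash are placeholders rather than values from the project:

def test_sqlite_regression(tmpdir):
    # sqlite_engine and install_sqlite are the retriever APIs used elsewhere on this page
    md5 = get_csv_md5('sample_dataset', sqlite_engine, tmpdir,
                      install_sqlite, {'use_cache': False})
    assert md5 == '<precomputed md5 for sample_dataset>'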
Example #7
def commit_info_for_commit(dataset, commit_message):
    """
    Generate info for a particular commit.
    """
    info = {
        "packages": package_details(),
        "time": datetime.now(timezone.utc).strftime("%m/%d/%Y, %H:%M:%S"),
        "version": dataset.version,
        "commit_message": commit_message,
        "script_name": os.path.basename(dataset._file),
    }
    path_to_raw_data = os.path.join(HOME_DIR, "raw_data", dataset.name)
    if os.path.exists(path_to_raw_data):
        info["md5_dataset"] = getmd5(path_to_raw_data,
                                     "dir",
                                     encoding=ENCODING)
    info["md5_script"] = getmd5(dataset._file,
                                data_type="file",
                                encoding=ENCODING)
    return info
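The returned dict looks roughly like this (all values illustrative):

{
    "packages": {...},  # output of package_details()
    "time": "01/31/2024, 12:00:00",
    "version": "1.5.0",
    "commit_message": "Update download URL",
    "script_name": "aquatic_animal_excretion.json",
    "md5_dataset": "<md5 of the raw_data directory>",  # present only when raw data exists
    "md5_script": "<md5 of the script file>"
}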
Example #8
def check_dataset(dataset):
    os.chdir(file_location)
    md5 = None
    status = None
    reason = None
    diff = None
    try:
        try:
            dataset_detail = json.load(open('dataset_details.json', 'r'))
        except FileNotFoundError:
            with open("dataset_details.json", 'w') as json_file:
                dataset_detail = dict()
                json.dump(dataset_detail, json_file)

        if dataset_type(dataset) == 'spatial':
            workdir = None
            try:
                workdir = mkdtemp(dir=file_location)
                download(dataset.name, path=workdir)
                md5 = getmd5(workdir, data_type='dir')
            except Exception:
                raise
            finally:
                if workdir:
                    rmtree(workdir)
        else:
            md5 = get_dataset_md5(dataset)
            if dataset.name not in dataset_detail \
                    or md5 != dataset_detail[dataset.name]['md5']:
                diff = diff_generator(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        os.chdir(file_location)
        with FileLock('dataset_details.json.lock'):
            with open('dataset_details.json', 'r') as dataset_details_read:
                json_file_details = json.load(dataset_details_read)
            json_file_details[dataset.name] = {
                "md5": md5,
                "status": status,
                "reason": reason,
                "diff": diff
            }
            with open('dataset_details.json', 'w') as dataset_details_write:
                json.dump(json_file_details,
                          dataset_details_write,
                          sort_keys=True,
                          indent=4)
Example #9
def get_csv_md5(dataset, engine, tmpdir, install_function, config):
    workdir = tmpdir.mkdtemp()
    os.system("cp -r {} {}/".format(
        os.path.join(retriever_root_dir, 'scripts'),
        os.path.join(str(workdir), 'scripts')))
    workdir.chdir()
    script_module = get_script_module(dataset)
    install_function(dataset.replace("_", "-"), **config)
    engine_obj = script_module.checkengine(engine)
    engine_obj.to_csv()
    os.system(
        "rm -r scripts")  # need to remove scripts before checking md5 on dir
    current_md5 = getmd5(data=str(workdir), data_type='dir')
    return current_md5
Example #10
def test_commit_installation(zip_file_name, expected_md5):
    """Installs the committed dataset in zip to sqlite and then converts
    it to csv to calculate md5 to compare it with the expected_md5"""
    db_name = 'test_sqlite.db'
    zip_file_path = os.path.join(file_location, "raw_data/dataset-provenance/", zip_file_name)
    engine = install_sqlite(zip_file_path, file=db_name, force=True)
    workdir = mkdtemp(dir=file_location)
    os.chdir(workdir)
    engine.to_csv()
    os.chdir(file_location)
    if os.path.isfile(db_name):
        os.remove(db_name)
    calculated_md5 = getmd5(workdir, data_type='dir', encoding=ENCODING)
    rmtree(workdir)
    assert calculated_md5 == expected_md5
Example #11
def get_csv_md5(dataset, engine, tmpdir, install_function, config):
    workdir = tmpdir.mkdtemp()
    src = os.path.join(retriever_root_dir, 'scripts')
    dest = os.path.join(str(workdir), 'scripts')
    subprocess.call(['cp', '-r', src, dest])
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    engine_obj.to_csv()
    # need to remove scripts before checking md5 on dir
    subprocess.call(['rm', '-r', 'scripts'])
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
Example #12
def get_csv_md5(dataset, engine, tmpdir, install_function, config, cols=None):
    workdir = tmpdir.mkdtemp()
    src = os.path.join(retriever_root_dir, 'scripts')
    dest = os.path.join(str(workdir), 'scripts')
    subprocess.call(['cp', '-r', src, dest])
    workdir.chdir()
    final_direct = os.getcwd()
    engine.script_table_registry = {}
    engine_obj = install_function(dataset.replace('_', '-'), **config)
    time.sleep(5)
    engine_obj.to_csv(select_columns=cols)
    # need to remove scripts before checking md5 on dir
    subprocess.call(['rm', '-r', 'scripts'])
    current_md5 = getmd5(data=final_direct, data_type='dir')
    os.chdir(retriever_root_dir)
    return current_md5
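The cp/rm subprocess calls in this and the previous example are Unix-only, as is the os.system variant in Example #9. A portable standard-library equivalent (a sketch, not the project's code):

import shutil

# replaces subprocess.call(['cp', '-r', src, dest])
shutil.copytree(src, dest)
# ... install and export steps go here ...
# replaces subprocess.call(['rm', '-r', 'scripts'])
shutil.rmtree('scripts')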
Example #13
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    try:
        try:
            with open(os.path.join(file_location, "dataset_details.json"), 'r') as json_file:
                dataset_detail = json.load(json_file)
        except (OSError, JSONDecodeError):
            dataset_detail = dict()
            dataset_detail['dataset_details'] = {}

        if dataset_type(dataset) == 'spatial':
            workdir = None
            try:
                workdir = mkdtemp(dir=file_location)
                download(dataset.name, path=workdir)
                md5 = getmd5(workdir, data_type='dir')
            except Exception:
                raise
            finally:
                if workdir:
                    rmtree(workdir)
        else:
            md5 = get_dataset_md5(dataset)
            if dataset.name not in dataset_detail['dataset_details'] \
                    or md5 != dataset_detail['dataset_details'][
                        dataset.name]['md5']:
                diff = diff_generator(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff}
        json_file_details["last_checked_on"] = datetime.now(timezone.utc).strftime("%d %b %Y")
        dataset_details_write = open(os.path.join(file_location, 'dataset_details.json'), 'w')
        json.dump(json_file_details, dataset_details_write,
                  sort_keys=True, indent=4)
        dataset_details_write.close()
Example #14
def test_getmd5_path():
    """Test md5 sum calculation given a path to data source."""
    data_file = create_file(['a,b,c', '1,2,3', '4,5,6'])
    exp_hash = '0bec5bf6f93c547bc9c6774acaf85e1a'
    assert getmd5(data=data_file, data_type='file') == exp_hash
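create_file is a helper from the same test suite; judging by the hash matching test_getmd5_line_end below, it presumably terminates each line with a newline. A minimal stand-in:

def create_file(lines, path='temp_file.csv'):
    # hypothetical reimplementation: one entry per line, newline-terminated
    with open(path, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    return path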
Example #15
def test_getmd5_line_end():
    """Test md5 sum calculation given a line with end of line character."""
    lines_end = ['a,b,c\n', '1,2,3\n', '4,5,6\n']
    exp_hash = '0bec5bf6f93c547bc9c6774acaf85e1a'
    assert getmd5(data=lines_end, data_type='lines') == exp_hash
Example #16
def test_getmd5_lines():
    """Test md5 sum calculation given a line."""
    lines = ['a,b,c', '1,2,3', '4,5,6']
    exp_hash = 'ca471abda3ebd4ae8ce1b0814b8f470c'
    assert getmd5(data=lines, data_type='lines') == exp_hash
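The two expected hashes in these tests differ only because of the trailing newlines: getmd5 evidently feeds each line to the digest as-is. A sketch of the equivalent hashlib computation:

import hashlib

def md5_of_lines(lines, encoding='utf-8'):
    checksum = hashlib.md5()
    for line in lines:
        checksum.update(line.encode(encoding))
    return checksum.hexdigest()

# with trailing '\n': matches exp_hash in test_getmd5_line_end
# without:            matches exp_hash in test_getmd5_lines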
Example #17
def test_download_regression(dataset, expected):
    """Test download regression."""
    os.chdir(retriever_root_dir)
    download(dataset, "raw_data/{0}".format(dataset))
    current_md5 = getmd5(data="raw_data/{0}".format(dataset), data_type='dir')
    assert current_md5 == expected
Example #18
def check_dataset(dataset):
    md5 = None
    status = None
    reason = None
    diff = None
    dataset_detail = None
    try:
        try:
            with open(os.path.join(file_location, "dataset_details.json"),
                      'r') as json_file:
                dataset_detail = json.load(json_file)
        except (OSError, JSONDecodeError):
            dataset_detail = dict()
            dataset_detail['dataset_details'] = {}

        if dataset_type(dataset) == 'spatial':
            install_postgres(dataset)
            md5 = getmd5(path.join(file_location, 'current', dataset.name),
                         data_type='dir')

            if dataset.name not in dataset_detail[
                    'dataset_details'] or md5 != dataset_detail[
                        'dataset_details'][dataset.name]['md5']:
                diff = diff_generator_spatial(dataset)
            else:
                for keys in dataset.tables:
                    file_name = '{}.{}'.format(dataset.name.replace('-', '_'),
                                               keys)
                    html_file_name = '{}.html'.format(file_name)
                    if os.path.exists(
                            os.path.join(file_location, 'diffs',
                                         html_file_name)):
                        remove(
                            os.path.join(file_location, 'diffs',
                                         html_file_name))
            data_shift(dataset, is_spatial=True)

        else:
            md5 = get_dataset_md5(dataset)

            if dataset.name not in dataset_detail[
                    'dataset_details'] or md5 != dataset_detail[
                        'dataset_details'][dataset.name]['md5']:
                diff = diff_generator(dataset)
            else:
                for keys in dataset.tables:
                    file_name = '{}_{}'.format(dataset.name.replace('-', '_'),
                                               keys)
                    html_file_name = '{}.html'.format(file_name)
                    if os.path.exists(
                            os.path.join(file_location, 'diffs',
                                         html_file_name)):
                        remove(
                            os.path.join(file_location, 'diffs',
                                         html_file_name))
            data_shift(dataset)
        status = True
    except Exception as e:
        reason = str(e)
        status = False
    finally:
        json_file_details = dataset_detail
        json_file_details["dataset_details"][dataset.name] = {
            "md5": md5,
            "status": status,
            "reason": reason,
            "diff": diff
        }
        json_file_details["last_checked_on"] = datetime.now(
            timezone.utc).strftime("%d %b %Y")
        dataset_details_write = open(
            os.path.join(file_location, 'dataset_details.json'), 'w')
        json.dump(json_file_details,
                  dataset_details_write,
                  sort_keys=True,
                  indent=4)
        dataset_details_write.close()
        if os.path.exists(os.path.join(HOME_DIR, 'raw_data', dataset.name)):
            rmtree(os.path.join(HOME_DIR, 'raw_data', dataset.name))