def test_file_checking(self):
    '''
    Various quality assurance checks on the DataReader's validation functions
    - include_flag in manifest used properly to determine loading or skipping
    - CSV file being loaded has the expected number of fields as found in meta.json
    - convenience function should_file_be_loaded performs properly
    '''
    META_FILENAME = os.path.join(os.path.dirname(__file__),
                                 'test_data/meta_sample.json')
    meta = load_meta_data(META_FILENAME)

    MANIFEST_FILENAME = os.path.join(os.path.dirname(__file__),
                                     'test_data/manifest_sample.csv')
    manifest = ManifestReader(MANIFEST_FILENAME)

    for manifest_row in manifest:
        csv_reader = DataReader(meta=meta, manifest_row=manifest_row)

        # Use unittest's assertion rather than a bare `assert`, which is
        # silently stripped when Python runs with -O.
        self.assertTrue(csv_reader.do_fields_match())

        # Assume the sql manifest row exactly matches the csv manifest row
        # for the purpose of this test.
        sql_manifest_row = manifest_row

        if manifest_row['include_flag'] == "skip":
            self.assertFalse(csv_reader.check_include_flag(sql_manifest_row))
            self.assertFalse(csv_reader.should_file_be_loaded(sql_manifest_row))
        elif manifest_row['include_flag'] == "use":
            self.assertTrue(csv_reader.check_include_flag(sql_manifest_row))
            self.assertTrue(csv_reader.should_file_be_loaded(sql_manifest_row))
def test_file_checking(self):
    '''
    Various quality assurance checks on the DataReader's validation functions
    - include_flag in manifest used properly to determine loading or skipping
    - CSV file being loaded has the expected number of fields as found in meta.json
    - convenience function should_file_be_loaded performs properly
    '''
    META_FILENAME = os.path.join(os.path.dirname(__file__),
                                 'test_data/meta_sample.json')
    meta = load_meta_data(META_FILENAME)

    MANIFEST_FILENAME = os.path.join(os.path.dirname(__file__),
                                     'test_data/manifest_sample.csv')
    manifest = ManifestReader(MANIFEST_FILENAME)

    for manifest_row in manifest:
        csv_reader = DataReader(meta=meta, manifest_row=manifest_row)

        # Use unittest's assertion rather than a bare `assert`, which is
        # silently stripped when Python runs with -O.
        # Assume the fields exactly match for the purpose of this test,
        # except as modified below.
        self.assertTrue(csv_reader._do_fields_match())

        # sql_manifest_row has an extra field to indicate whether it has been
        # successfully loaded in previous runs or not. Copy instead of
        # aliasing so that setting 'status' below does not mutate the
        # manifest_row we are iterating over.
        sql_manifest_row = dict(manifest_row)
        sql_manifest_row['status'] = 'failed'

        if manifest_row['include_flag'] == "skip":
            self.assertFalse(csv_reader._check_include_flag(sql_manifest_row))
            self.assertFalse(csv_reader.should_file_be_loaded(sql_manifest_row))
        elif manifest_row['include_flag'] == "use":
            self.assertTrue(csv_reader._check_include_flag(sql_manifest_row))
            self.assertTrue(csv_reader.should_file_be_loaded(sql_manifest_row))

        # If the file is already in the SQL manifest, skip it when manifest
        # says 'use' (i.e. it's already loaded)
        sql_manifest_row['status'] = 'loaded'
        if manifest_row['include_flag'] == "use":
            self.assertFalse(csv_reader._check_include_flag(sql_manifest_row))
def make_all_json(manifest_path):
    """Generate a draft json entry for every usable table in the manifest.

    Walks the manifest rows and, for each destination table flagged 'use'
    that has not been handled yet, builds the absolute path to its source
    file and hands it to make_draft_json. Each table is processed at most
    once; rows whose table was already generated are reported and skipped.
    """
    completed_tables = {}
    manifest = ManifestReader(path=manifest_path)
    if manifest.has_unique_ids():
        for row in manifest:
            table = row['destination_table']
            if table in completed_tables:
                # Table already generated from an earlier manifest row.
                print("skipping {}, already loaded".format(
                    row['unique_data_id']))
                continue
            if row['include_flag'] != 'use':
                continue
            source_path = os.path.abspath(
                os.path.join(row['local_folder'], row['filepath']))
            make_draft_json(source_path, table)
            completed_tables[table] = source_path
            print("completed {}".format(table))
        print("Finished! Used these files to generate json:")
        print(completed_tables)
def test_unique_manifest_ids(self):
    '''Make sure the 'unique_data_id' column in the manifest is unique to each row'''
    test_data_dir = os.path.dirname(__file__)

    # Example with all unique values
    all_unique = ManifestReader(
        os.path.join(test_data_dir, 'test_data/manifest_unique.csv'))
    self.assertTrue(all_unique.has_unique_ids())

    # Example with a duplicated name
    has_duplicates = ManifestReader(
        os.path.join(test_data_dir, 'test_data/manifest_duplicates.csv'))
    self.assertFalse(has_duplicates.has_unique_ids())
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
     - Use manifest.csv to find out all the files we want to load
     - Compare current manifest.csv to the sql database manifest table, which
       reflects manifest.csv as of the last time this script was run, and
       tells us whether or not the file has already been loaded in this
       database.
     - Use meta.json to make sure the CSV file has all the fields we expect
       it to, and to decide the data type of each field.
     - Load the csv into the database

    If there is an error loading a file (either flagged error from fields
    not matching, or parsing error if data type isn't right):
     - skip loading this file,
     - report the error to SQL using update_sql_manifest(status="error")
     - use logging.warning() to write the specific error encountered to the
       log file
     - at the end of loading print an error message to the console
    """
    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    manifest = ManifestReader(manifest_path)

    # load given database choice and check/create sql manifest
    engine = dbtools.get_database_engine(database_choice)
    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    # TODO should this be moved into the __init__ of ManifestReader? Do we
    # ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    # Iterate through each row in the manifest then clean and validate. If it
    # passes validation, then write to temp psv file for loading into database.
    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        # TODO: figure out a way to flag this issue early in loading of manifest

        # only clean and validate data files flagged for use in database
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))

            # prepare csv reader and writer objects
            csv_reader = DataReader(meta=meta, manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row,
                                   filename=temp_filepath)

            # prepare objects for interfacing with database and then get
            # the equivalent manifest row from the database
            sql_interface = HISql(meta=meta, manifest_row=manifest_row,
                                  engine=engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # Identify fields that exist in meta.json but not CSV
            # so we can add them to the row as it is cleaned and loaded.
            meta_only_fields = {}
            for field in meta[tablename]['fields']:
                if field['source_name'] not in csv_reader.keys:
                    # adds 'sql_name',None as key,value pairs in dict
                    meta_only_fields[field['sql_name']] = None

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(
                    sql_manifest_row=sql_manifest_row):
                print(" Cleaning...")
                for idx, data_row in enumerate(csv_reader):
                    data_row.update(meta_only_fields)  # insert other field dict
                    clean_data_row = cleaner.clean(data_row, idx)
                    # Identity check is the idiomatic None test (PEP 8);
                    # rows the cleaner rejects are dropped.
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)
                csv_writer.close()

                print(" Loading...")
                # Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info(" replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table_if_necessary()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()
# SETUP - Mostly taken from load_data.py this_file = os.path.realpath(__file__) this_dir = os.path.dirname(this_file) meta_path = os.path.abspath( this_dir + '/test_data/meta_sample.json') # TODO Update when meta.json is updated. manifest_path = os.path.abspath( this_dir + '/test_data/manifest_sample.csv') # TODO Consider updating manifest_sample database_choice = 'docker_database' logging_path = os.path.abspath("../logs") meta = ingestionfunctions.load_meta_data(meta_path) engine = dbtools.get_database_engine(database_choice) manifest = ManifestReader(manifest_path) FAKE_REQUIRED_FIELD = { 'sql_name': 'foo', 'type': 'decimal', 'source_name': 'FAKE_REQUIRED', 'display_text': 'Blah', 'display_name': 'x', 'required_in_source': True } FAKE_OPTIONAL_FIELD = { 'sql_name': 'foo', 'type': 'decimal', 'source_name': 'FAKE_OPTIONAL', 'display_text': 'Blah', 'display_name': 'x',
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
     - Use manifest.csv to find out all the files we want to load
     - Compare current manifest.csv to the sql database manifest table, which
       reflects manifest.csv as of the last time this script was run, and
       tells us whether or not the file has already been loaded in this
       database.
     - Use meta.json to make sure the CSV file has all the fields we expect
       it to, and to decide the data type of each field
     - Load the csv into the database

    If there is an error loading a file (either flagged error from fields
    not matching, or parsing error if data type isn't right):
     - skip loading this file,
     - report the error to SQL using update_sql_manifest(status="error")
     - use logging.warning() to write the specific error encountered to the
       log file
     - at the end of loading print an error message to the console
    """
    meta = ingestionfunctions.load_meta_data(meta_path)
    engine = dbtools.get_database_engine(database_choice)
    manifest = ManifestReader(manifest_path)

    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    # TODO should this be moved into the __init__ of ManifestReader? Do we
    # ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))

            csv_reader = DataReader(meta=meta, manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row,
                                   filename=temp_filepath)
            sql_interface = HISql(meta=meta, manifest_row=manifest_row,
                                  engine=engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
                print(" Ready to clean {}".format(csv_reader.destination_table))
                for idx, data_row in enumerate(csv_reader):
                    clean_data_row = cleaner.clean(data_row, idx)
                    # Identity check is the idiomatic None test (PEP 8);
                    # rows the cleaner rejects are dropped.
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)
                csv_writer.close()

                print(" Ready to load")
                # Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()