Example #1
from housinginsights.tools import dbtools


def drop_tables(database_choice):
    """
    Used to rebuild the database by first dropping all tables before running main().
    """
    engine = dbtools.get_database_engine(database_choice)
    db_conn = engine.connect()
    # Dropping and recreating the public schema removes every table in it
    query_result = db_conn.execute("DROP SCHEMA public CASCADE;CREATE SCHEMA public;")
    db_conn.close()
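A minimal usage sketch, assuming a full-reload workflow: wipe the schema first, then re-run the loader. The 'docker_database' choice mirrors the default used throughout these examples.

# Hypothetical rebuild flow
drop_tables('docker_database')
# ...followed by main(...) to recreate and repopulate the tables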
Example #2
    def create_project_subsidy_csv(self,
                                   uid,
                                   project_fields_map,
                                   subsidy_fields_map,
                                   database_choice=None):
        """
        Writes 'new_proj_data_file' and 'new_subsidy_data_file' raw files
        from the source csv file. It then deletes the source file so it
        doesn't get added to manifest and loaded into the database.
        """
        if database_choice is None:
            database_choice = 'docker_database'
        engine = dbtools.get_database_engine(database_choice)
        db_conn = engine.connect()

        # create file path objects
        source_csv = self.output_paths[uid]
        folder = os.path.dirname(source_csv)
        new_proj_data_file = os.path.join(folder, "{}_project.csv".format(uid))
        new_subsidy_data_file = os.path.join(folder,
                                             "{}_subsidy.csv".format(uid))

        # create dchousing_project/subsidy files from source; newline='' is the
        # csv-module recommended mode for files passed to DictReader/DictWriter
        with open(source_csv, newline='', encoding='utf-8') as f, \
                open(new_proj_data_file, mode='w', newline='', encoding='utf-8') as proj, \
                open(new_subsidy_data_file, mode='w', newline='', encoding='utf-8') as subsidy:

            source_csv_reader = csv.DictReader(f)
            proj_writer = csv.DictWriter(proj, fieldnames=PROJ_FIELDS)
            proj_writer.writeheader()
            subsidy_writer = csv.DictWriter(subsidy, fieldnames=SUBSIDY_FIELDS)
            subsidy_writer.writeheader()

            # If the project doesn't exist in the database, we want to add a
            # new record to the project table.
            #
            # All projects need a new record added to the subsidy table. If the
            # project already exists in the database, link it to that project
            # using the nlihc_id; if not, link it to the record that was added
            # to the proj_writer output file.
            for building in source_csv_reader:
                nlihc_id, in_proj_table = self._get_nlihc_id_from_db(
                    db_conn=db_conn, address_id=building['ADDRESS_ID'])

                if not in_proj_table:
                    data = self._map_data_for_row(
                        nlihc_id=nlihc_id,
                        fields=PROJ_FIELDS,
                        fields_map=project_fields_map,
                        line=building)
                    proj_writer.writerow(data)

                # add all to subsidy table
                data = self._map_data_for_row(nlihc_id=nlihc_id,
                                              fields=SUBSIDY_FIELDS,
                                              fields_map=subsidy_fields_map,
                                              line=building)
                subsidy_writer.writerow(data)
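A hedged usage sketch: the instance name, uid, and field-map contents below are placeholders for illustration; only the keyword signature comes from the method above.

api.create_project_subsidy_csv(
    uid='dchousing',
    project_fields_map={'PROJECT_NAME': 'proj_name'},   # placeholder mapping
    subsidy_fields_map={'PROGRAM': 'program'},          # placeholder mapping
    database_choice='docker_database')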
Example #3
    def __init__(self,
                 baseurl,
                 proxies=None,
                 database_choice=None,
                 debug=False):

        if database_choice is None:
            database_choice = 'docker_database'

        super().__init__(baseurl, proxies, database_choice, debug=debug)

        self.engine = dbtools.get_database_engine(database_choice)

        # Get a dict mapping address to MAR id from the database and store it
        # in memory for use in other methods.
        self.add_mar_addr_lookup()
Example #4
    def __init__(self,
                 database_choice=None,
                 meta_path=None,
                 manifest_path=None,
                 keep_temp_files=True,
                 drop_tables=False):
        """
        Initializes the class with optional arguments. The default behaviour 
        is to load the local database with data tracked from meta.json 
        and manifest.csv within the 'python/scripts' folder.
        
        :param database_choice: choice of 'local_database', 
        'docker_database', and 'remote_database'
        :param meta_path: the path of the meta.json to be used
        :param manifest_path: the path of the manifest_path.csv to be used
        :param keep_temp_files: if True, temp clean pipe-delimited files will be
        archived in the 'python/logs' folder
        """

        # load defaults if no arguments passed
        _scripts_path = os.path.abspath(os.path.join(PYTHON_PATH, 'scripts'))
        if database_choice is None:
            self.database_choice = 'docker_database'
        else:
            self.database_choice = database_choice
        if meta_path is None:
            meta_path = os.path.abspath(
                os.path.join(_scripts_path, 'meta.json'))
        if manifest_path is None:
            manifest_path = os.path.abspath(
                os.path.join(_scripts_path, 'manifest.csv'))
        self._keep_temp_files = keep_temp_files

        # load given meta.json and manifest.csv files into memory
        self.meta = ingestionfunctions.load_meta_data(meta_path)
        self.manifest = Manifest(manifest_path)

        # setup engine for database_choice
        self.engine = dbtools.get_database_engine(self.database_choice)

        # write the meta.json to the database
        self._meta_json_to_database()

        self._failed_table_count = 0

        self.drop_tables = drop_tables
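A minimal instantiation sketch; the class name LoadData is an assumption (the snippet does not show the class statement), and the arguments simply exercise the defaults documented above.

# Hypothetical: use the default meta.json/manifest.csv paths and rebuild from scratch
loader = LoadData(database_choice='docker_database',
                  keep_temp_files=False,
                  drop_tables=True)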
Example #5
    def get_dist(self, unique_data_ids=None, sample=False, output_type='csv',
                 db='local_database'):

        if 'wmata_dist' in unique_data_ids:
            u = 'wmata_dist'
            # Variable to hold data until written to file
            self.distOutput = []
            self.distHeader = ('nlihc_id', 'type', 'stop_id_or_station_code',
                               'dist_in_miles', 'crow_distance', 'building_lat',
                               'building_lon', 'stop_or_station_lat',
                               'stop_or_station_lon')


            # First, find which projects we should be calculating from
            try:
                # Configure the connection
                engine = dbtools.get_database_engine(db)
                conn = dbtools.get_database_connection(db)
                logger.info("  Connected to Housing Insights database")
                columnset = conn.execute(
                    "select column_name from INFORMATION_SCHEMA.COLUMNS"
                    " where TABLE_NAME='project'")

                # Get the rows
                proj_query = 'select * from project'
                if sample:
                    proj_query = proj_query + " limit 1"
                rows = conn.execute(proj_query)

                conn.close()
                engine.dispose()
            except Exception as e:
                logger.error("I am unable to connect to the database: %s", e)
                return

            columns = []
            for c in columnset:
                columns.append(c)

            total_rows = rows.rowcount
            logger.info("Total rows: %s", total_rows)

            # Get the rail stations once (wmata provides no option to only
            # return the closest stations)
            wmata_headers = self._get_wmata_headers()
            railResponse = requests.get(
                "https://api.wmata.com/Rail.svc/json/jStations",
                headers=wmata_headers)
            self.railStations = railResponse.json()['Stations']

            # for every project, get nearby stations and walking distance
            for idx, row in enumerate(rows):
                radius = self._get_meters(0.5)

                project_details = self._get_project_info(row, columns)
                lat = project_details['lat']
                lon = project_details['lon']

                if lat is None or lon is None:
                    logger.warning("  Lat or Lon not available for project {}".format(
                        project_details['nlihcid']))
                else:
                    logger.info("  Processing project %s of %s", idx + 1, total_rows)

                    # find all metro stations within 0.5 miles
                    logger.info("  Starting processing rail stations for %s",
                                project_details['nlihcid'])
                    self._find_rail_stations(self.railStations, project_details,
                                             radius, sample=sample, db=db)
                    logger.info("  Completed processing rail stations for project id %s",
                                project_details['nlihcid'])

                    # find all bus stops within 0.5 miles
                    logger.info("  Starting processing bus stations for project id %s",
                                project_details['nlihcid'])
                    self._find_bus_stations(project_details, radius, sample=sample, db=db)
                    logger.info("  Completed processing bus stations for project id %s",
                                project_details['nlihcid'])

            # Save the data
            if output_type == 'csv':
                self._array_to_csv(self.distHeader, self.distOutput,
                                   self.output_paths[u])

            elif output_type == 'stdout':
                print("==========================================================================\n")
                print(self.distHeader)
                print("==========================================================================\n")
                for line in self.distOutput:
                    print(line)

            else:
                self._array_to_csv(self.distHeader, self.distOutput, output_type)
        else:
            # not a unique data id supported by this class
            pass
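A hedged usage sketch; the WmataApiConn class name and its constructor are assumptions about the surrounding class, which this snippet does not show.

# Hypothetical: compute distances for one sample project and print them
wmata = WmataApiConn()   # assumed constructor; real arguments not shown here
wmata.get_dist(unique_data_ids=['wmata_dist'], sample=True,
               output_type='stdout', db='docker_database')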
Example #6
    def _get_walking_distance(self, srcLat, srcLon, destLat, destLon,
                              db='local_database'):
        """Returns the walking distance in meters between two locations

           Parameters:
           srcLat - latitude for source location
           srcLon - longitude for source location
           destLat - latitude for destination location
           destLon - longitude for destination location
           mapbox_api_key - api key for mapbox REST services
           """

        if self.use_cached_distance:
            try:
                # Configure the connection
                engine = dbtools.get_database_engine(db)
                conn = dbtools.get_database_connection(db)

                # Pull columns to see if the database has updated columns in wmata_dist
                columnset = conn.execute(
                    "select column_name from INFORMATION_SCHEMA.COLUMNS"
                    " where TABLE_NAME='wmata_dist'")
                columns = []
                for c in columnset:
                    columns.append(c[0])

                if ('building_lat' in columns and 'building_lon' in columns and
                        'stop_or_station_lat' in columns and
                        'stop_or_station_lon' in columns):
                    # See if a cached row exists for this building/stop pair
                    proj_query = ("select * from wmata_dist"
                                  " where building_lat='" + str(srcLat) +
                                  "' and building_lon='" + str(srcLon) +
                                  "' and stop_or_station_lat='" + str(destLat) +
                                  "' and stop_or_station_lon='" + str(destLon) + "'")
                    proxy = conn.execute(proj_query)
                    results = [dict(x) for x in proxy.fetchall()]

                    if results:
                        logger.info("  Found cached row!")
                        walking_distance = results[0]['dist_in_miles']

                        conn.close()
                        engine.dispose()

                        return float(walking_distance) * self.meters_per_mile
                    else:
                        logger.info("  Couldn't find cached row for %s", proj_query)
                else:
                    logger.info("Couldn't find all columns")

                conn.close()
                engine.dispose()
            except Exception as e:
                logger.error("Unable to connect to the database: %s", e)

        distReqCoords = "{},{};{},{}".format(srcLon, srcLat, destLon, destLat)

        mapbox_params = self.mapbox_api_key

        # According to the documentation this doesn't work in the Python SDK,
        # so we use the REST API directly.
        walkDistResponse = requests.get(
            "https://api.mapbox.com/directions/v5/mapbox/walking/" + distReqCoords,
            params=mapbox_params)
        time.sleep(0.8)

        # Retry up to 10 times if Mapbox reports that we are being rate limited
        i = 0
        while "Too Many Requests" in str(walkDistResponse.json()) and i < 10:
            walkDistResponse = requests.get(
                "https://api.mapbox.com/directions/v5/mapbox/walking/" + distReqCoords,
                params=mapbox_params)
            i = i + 1
            time.sleep(0.8)
            if i == 10:
                raise Exception("Mapbox directions request still rate limited"
                                " after 10 retries")

        distance = walkDistResponse.json()['routes'][0]['legs'][0]['distance']
        print("Return value: ", distance)
        return distance
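The retry loop above could be factored into a small helper. A sketch under the same assumption the method makes, namely that rate limiting is detected by scanning the response body for "Too Many Requests":

def _get_with_retry(url, params, max_tries=10, pause=0.8):
    # Hypothetical helper mirroring the loop above: re-issue the GET while the
    # response still reports rate limiting, up to max_tries attempts.
    for _ in range(max_tries):
        response = requests.get(url, params=params)
        time.sleep(pause)
        if "Too Many Requests" not in response.text:
            return response
    raise Exception("Request still rate limited after {} tries".format(max_tries))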
Example #7
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which
         reflects manifest.csv as of the last time this script was run, and
         tells us whether or not the file has already been loaded in this
         database.
        - Use meta.json to make sure the CSV file has all the fields we expect
        it to, and to decide the data type of each field.
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not
    matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the
    log file
    - at the end of loading print an error message to the console
    """

    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    manifest = ManifestReader(manifest_path)

    # load given database choice and check/create sql manifest
    engine = dbtools.get_database_engine(database_choice)
    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    # Iterate through each row in the manifest then clean and validate. If it
    # passes validation, then write to temp psv file for loading into database.
    for manifest_row in manifest:
        #Incompletely filled out rows in the manifest can break the other code
        # TODO: figure out a way to flag this issue early in loading of manifest

        # only clean and validate data files flagged for use in database
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))
            # prepare csv reader and writer objects
            csv_reader = DataReader(meta=meta,
                                    manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta,
                                   manifest_row=manifest_row,
                                   filename=temp_filepath)

            # prepare objects for interfacing with database and then get
            # the equivalent manifest row from the database
            sql_interface = HISql(meta=meta,
                                  manifest_row=manifest_row,
                                  engine=engine,
                                  filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            #Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # Identify fields that exist in meta.json but not CSV
            # so we can add them to the row as it is cleaned and loaded.
            meta_only_fields = {}
            for field in meta[tablename]['fields']:
                if field['source_name'] not in csv_reader.keys:
                    # adds 'sql_name',None as key,value pairs in dict
                    meta_only_fields[field['sql_name']] = None

            #clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(
                    sql_manifest_row=sql_manifest_row):
                print("  Cleaning...")
                for idx, data_row in enumerate(csv_reader):
                    data_row.update(
                        meta_only_fields)  # insert other field dict
                    clean_data_row = cleaner.clean(data_row, idx)
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Loading...")

                #Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                #Appends to table; if dropped, it recreates
                sql_interface.create_table_if_necessary()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    #TODO tell user total count of errors.
                    #currently write_file_to_sql() just writes in log that file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()
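A hedged example of driving this routine; the paths mirror the scripts/meta.json default seen elsewhere in these examples and are illustrative, not authoritative.

# Hypothetical invocation
main(database_choice='docker_database',
     meta_path=os.path.abspath('../../scripts/meta.json'),
     manifest_path=os.path.abspath('../../scripts/manifest.csv'),
     keep_temp_files=False)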
Example #8
# SETUP - Mostly taken from load_data.py
this_file = os.path.realpath(__file__)
this_dir = os.path.dirname(this_file)

meta_path = os.path.abspath(
    this_dir +
    '/test_data/meta_sample.json')  # TODO Update when meta.json is updated.
manifest_path = os.path.abspath(
    this_dir +
    '/test_data/manifest_sample.csv')  # TODO Consider updating manifest_sample
database_choice = 'docker_database'
logging_path = os.path.abspath("../logs")

meta = ingestionfunctions.load_meta_data(meta_path)
engine = dbtools.get_database_engine(database_choice)
manifest = ManifestReader(manifest_path)

FAKE_REQUIRED_FIELD = {
    'sql_name': 'foo',
    'type': 'decimal',
    'source_name': 'FAKE_REQUIRED',
    'display_text': 'Blah',
    'display_name': 'x',
    'required_in_source': True
}
FAKE_OPTIONAL_FIELD = {
    'sql_name': 'foo',
    'type': 'decimal',
    'source_name': 'FAKE_OPTIONAL',
    'display_text': 'Blah',
    'display_name': 'x',
    'required_in_source': False
}
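A sketch of how such a fake field might be used in a test, assuming the meta[tablename]['fields'] structure seen in the loader examples above; the 'project' table name is a placeholder.

# Hypothetical test setup: inject the fake field into a copied meta entry so the
# loader sees a required column that the sample CSV does not actually contain.
import copy

test_meta = copy.deepcopy(meta)
test_meta['project']['fields'].append(FAKE_REQUIRED_FIELD)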
Example #9
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which reflects manifest.csv as
          of the last time this script was run, and tells us whether or not the file has already been loaded in this database.
        - Use meta.json to make sure the CSV file has all the fields we expect it to, and to decide the data type of each field
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the log file
    - at the end of loading print an error message to the console
    """
    meta = ingestionfunctions.load_meta_data(meta_path)
    engine = dbtools.get_database_engine(database_choice)
    manifest = ManifestReader(manifest_path)

    sql_manifest_exists = ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        #Incompletely filled out rows in the manifest can break the other code
        if manifest_row['include_flag'] == 'use':
            logging.info("{}: preparing to load row {} from the manifest".format(manifest_row['unique_data_id'],len(manifest)))

            temp_filepath = os.path.abspath(
                                os.path.join(
                                    logging_path,
                                    'temp_{}.psv'.format(manifest_row['unique_data_id'])
                                ))
            csv_reader = DataReader(meta=meta, manifest_row=manifest_row, load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row, filename=temp_filepath)
            sql_interface = HISql(meta=meta, manifest_row=manifest_row, engine=engine,
                                  filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            #Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta,
                manifest_row=manifest_row,
                name=cleaner_class_name)

            #clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
                print("  Ready to clean {}".format(csv_reader.destination_table))
                for idx, data_row in enumerate(csv_reader):
                    clean_data_row = cleaner.clean(data_row, idx)
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Ready to load")
                
                #Decide whether to append or replace the table
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()
                
                #Appends to table; if dropped, it recreates
                sql_interface.create_table()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    #TODO tell user total count of errors. 
                    #currently write_file_to_sql() just writes in log that file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()
Example #10
def get_cleaner_from_name(meta, manifest_row, name, engine=None):
    """
    Returns an instance of the cleaner class referenced in meta.json for the table.

    :param meta: in memory JSON object representing meta.json
    :param manifest_row: the given row in manifest.csv
    :param name: the referenced cleaner class in meta.json for the table as str
    :return: a class object of the given cleaner class
    """

    # Look up the cleaner class by name on the Cleaners module and instantiate it
    #module = import_module("module.submodule")
    Class_ = getattr(Cleaners, name)
    instance = Class_(meta, manifest_row, engine=engine)
    return instance


def join_paths(pieces=None):
    '''
    Joins arbitrary pieces of a url or path.
    Alternative to os.path.join if the second argument might start with "/".
    '''
    pieces = pieces or []  # avoid a mutable default argument
    return '/'.join(s.strip('/') for s in pieces)
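For example, with illustrative values: each piece has its leading/trailing slashes stripped before re-joining, so an absolute-looking second argument does not discard the base the way os.path.join would.

join_paths(['https://example.com/', '/api/', 'v1'])
# -> 'https://example.com/api/v1'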

#Used for testing purposes
if __name__ == '__main__':

    from housinginsights.tools import dbtools
    meta_path = os.path.abspath('../../scripts/meta.json')


    meta = load_meta_data(meta_path)
    engine = dbtools.get_database_engine('docker_database')
    meta_json_to_database(engine=engine, meta=meta)