# Imports needed by the code below. Project-internal names (Database, FakeNews,
# LOGGER, utils, compute_age, compute_match_score, and the _-prefixed helpers
# such as _group_mm2000 and _parse_mm2000_date) are assumed to be defined
# elsewhere in the package.
import collections
import csv
import functools
import logging
import operator
import os

import daiquiri
import pandas as pd
import yaml


def initialize_log_table():
    """Builds the table in the postgres database that is used for
    storing application logs."""
    database = Database(database='postgres')

    LOGGER.info('Creating the application_logs schema ...')
    schema_sql = "CREATE SCHEMA IF NOT EXISTS application_logs"
    database.run_query(schema_sql)

    table_sql = """
        CREATE TABLE IF NOT EXISTS application_logs.shir_connect_logs (
            id text,
            application_user text,
            authorized boolean,
            base_url text,
            endpoint text,
            host text,
            host_url text,
            query_string text,
            referrer text,
            remote_addr text,
            scheme text,
            url text,
            url_root text,
            user_agent text,
            load_datetime timestamp
        )
    """
    LOGGER.info('Creating the shir_connect_logs table ...')
    database.run_query(table_sql)
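# Usage sketch (assumes a reachable postgres instance and the package's
# Database wrapper). Both statements use IF NOT EXISTS, so the call is
# idempotent and safe to run on every application start:
#
#   initialize_log_table()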
class MM2000:
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        # Load column mapping configs
        self.path = os.path.dirname(os.path.realpath(__file__))
        filename = os.path.join(self.path, 'member_columns.yml')
        with open(filename, 'r') as f:
            self.column_mapping = yaml.safe_load(f)

        self.database = Database() if not database else database
        self.fake_news = FakeNews(database=self.database)

    #####################################
    # Methods for loading MM2000 members
    #####################################

    def load(self, df):
        """Loads the data into the member database."""
        self.logger.info('Parsing MM2000 data.')
        items = self.parse_mm2000(df)

        self.logger.info('Backing up current member table.')
        self.database.backup_table('members')
        self.logger.info('Truncating current member table.')
        self.database.truncate_table('members')
        self.logger.info('Loading updated member data.')
        for item in items:
            self.database.load_item(item, 'members')

        self.logger.info('Checking updated columns.')
        good_columns = self.check_columns()
        if good_columns:
            self.logger.info('Generating demo data.')
            self.fake_news.fake_names()
            self.logger.info('Refreshing materialized views.')
            self.database.refresh_view('members_view')
            self.database.refresh_view('participants')
        else:
            # Restore the backup if the upload changed the schema
            self.logger.warning('Column mismatch in upload.')
            self.database.revert_table('members')
            return False
        return True

    def parse_mm2000(self, df):
        """Converts the MM2000 export into a list of rows."""
        column_mapping = self.column_mapping['MM2000']
        items = []
        for group in column_mapping:
            column_map = column_mapping[group]['columns']
            df_group = _group_mm2000(df, column_map)
            if 'id_extension' in column_mapping[group]:
                id_extension = column_mapping[group]['id_extension']
            else:
                id_extension = None

            for i in df_group.index:
                item = dict(df_group.loc[i])
                item = _parse_postal_code(item)
                item = _check_mm2000_active(item)

                # ID extension for children and spouses,
                # since a family shares the same id
                item['household_id'] = item['id']
                if id_extension:
                    item['id'] += id_extension

                # Remove invalid birth dates
                item = _parse_mm2000_date(item, 'birth_date')
                item = _parse_mm2000_date(item, 'membership_date')

                # Skip if the member is under the minimum age
                # that we keep in the database
                too_young = utils.check_age(item['birth_date'], min_age=18)
                if too_young:
                    continue

                # Children only have a full name, not separate
                # first and last names
                if 'first_name' not in item and item.get('full_name'):
                    item['first_name'] = item['full_name'].split()[0]
                if 'last_name' not in item and item.get('full_name'):
                    # Use the last token of the full name, not the first
                    item['last_name'] = item['full_name'].split()[-1]

                if not item.get('first_name') or not item.get('last_name'):
                    continue
                items.append(item)
        return items

    def check_columns(self):
        """Checks that the columns in the new table match
        the columns in the backup table."""
        new_columns = self.database.get_columns('members')
        old_columns = self.database.get_columns('members_backup')
        for column in new_columns:
            if column not in old_columns:
                return False
        return True

    ###########################################
    # Methods for handling MM2000 resignations
    ###########################################

    def load_resignations(self, df):
        """Loads MM2000 resignation data into the database."""
        _validate_resignation_data(df)

        # Map the file column names to the database column names
        df = df.rename(columns=self.column_mapping['MM2000 Resignations'])
        # Drop any rows where the resignation date is null
        df = df.dropna(axis=0, how='any', subset=['resignation_date'])

        for i in df.index:
            member = dict(df.loc[i])
            member = _parse_mm2000_date(member, 'resignation_date')
            resignation_date = str(member['resignation_date'])[:10]

            # TODO: This logic is specific to TRS because that's how they
            # track people who rejoined the congregation. We may have to
            # update this if another client uses MM2000
            if 'Comment1' in member:
                if 'rejoin' in str(member['Comment1']).lower():
                    resignation_date = None
            if 'Comment2' in member:
                if 'rejoin' in str(member['Comment2']).lower():
                    resignation_date = None

            if resignation_date:
                resignation_date = "'{}'".format(resignation_date)
            else:
                # Use SQL NULL; formatting Python's None into the query
                # would produce invalid SQL
                resignation_date = 'NULL'
            sql = """
                UPDATE {schema}.members
                SET resignation_date = {resignation_date}
                WHERE (household_id = '{member_id}' OR id = '{member_id}')
            """.format(schema=self.database.schema,
                       resignation_date=resignation_date,
                       member_id=member['id'])
            self.database.run_query(sql)

            reason = _find_resignation_reason(member['resignation_reason'])
            sql = """
                UPDATE {schema}.members
                SET resignation_reason = '{reason}'
                WHERE (household_id = '{member_id}' OR id = '{member_id}')
            """.format(schema=self.database.schema,
                       reason=reason,
                       member_id=member['id'])
            self.database.run_query(sql)

        self.database.refresh_views()
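# Usage sketch for the MM2000 loader. The file names are illustrative; the
# exports are assumed to have been read into pandas DataFrames matching the
# layout described in member_columns.yml:
#
#   mm2000 = MM2000()
#   members = pd.read_csv('mm2000_member_export.csv')
#   if mm2000.load(members):
#       resignations = pd.read_csv('mm2000_resignation_export.csv')
#       mm2000.load_resignations(resignations)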
class NameResolver:
    """Resolves the names of participants using participant characteristics."""
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.path = os.path.dirname(os.path.realpath(__file__))
        self.database = Database() if not database else database
        self.lookup = self._read_names_file()
        self.average_age = None

    def load_member_ids(self):
        """Loads member information into the participant match table.

        Only loads members that have not already been loaded into
        the participant match table.
        """
        sql = """
            INSERT INTO {schema}.participant_match
            (id, member_id, first_name, last_name, nickname, email,
             birth_date, is_birth_date_estimated)
            SELECT uuid_generate_v4(),
                   id as member_id,
                   first_name,
                   last_name,
                   nickname,
                   email,
                   birth_date,
                   false
            FROM {schema}.members
            WHERE id NOT IN (SELECT member_id
                             FROM {schema}.participant_match)
        """.format(schema=self.database.schema)
        self.database.run_query(sql)

    def get_fuzzy_matches(self, first_name, last_name, tolerance=1):
        """Returns all names from the participants table that are within
        the given edit distance (tolerance) of the first name and last name."""
        # Escape single quotes for PostgreSQL
        first_name = first_name.replace("'", "''")
        last_name = last_name.replace("'", "''")

        select, conditions = self._first_name_sql(first_name, tolerance)
        sql = """
            SELECT id,
                   member_id,
                   first_name,
                   last_name,
                   nickname,
                   email,
                   birth_date,
                   is_birth_date_estimated
            FROM (
                SELECT *, {select}
                FROM {schema}.participant_match
            ) x
            WHERE (({conditions}) AND last_name = '{last_name}')
        """.format(select=select,
                   conditions=conditions,
                   schema=self.database.schema,
                   last_name=last_name)
        df = pd.read_sql(sql, self.database.connection)
        results = self.database.to_json(df)
        return results

    def find_best_match(self, first_name, last_name, email=None, age=None):
        """Finds the best match, given the criteria that are provided.
        If there are no matches, None is returned."""
        matches = self.get_fuzzy_matches(first_name, last_name)
        if not self.average_age:
            self.average_age = self._get_average_age()

        if not matches:
            return None

        for match in matches:
            # Fall back to the average age when a birth date is missing
            if not match['birth_date'] or match['birth_date'] < 0:
                match['age'] = self.average_age
            else:
                match['age'] = compute_age(match['birth_date'])
            match_score = compute_match_score(match,
                                              first_name=first_name,
                                              email=email,
                                              age=age)
            match['match_score'] = match_score

        sorted_matches = sorted(matches, key=lambda k: k['match_score'],
                                reverse=True)
        return sorted_matches[0]

    def _get_average_age(self):
        """Pulls the average participant age.

        Used when an observation does not have an age recorded."""
        sql = """
            SELECT AVG(age) as avg_age
            FROM (
                SELECT DATE_PART('year', AGE(now(), birth_date)) as age
                FROM {schema}.participant_match
                WHERE birth_date is not null
            ) x
        """.format(schema=self.database.schema)
        df = pd.read_sql(sql, self.database.connection)
        avg_age = None
        if len(df) > 0:
            avg_age = df.loc[0]['avg_age']
        return avg_age

    def _read_names_file(self):
        """Reads names.csv, which contains mappings of names to nicknames."""
        filename = os.path.join(self.path, 'names.csv')
        lookup = collections.defaultdict(list)
        with open(filename) as f:
            reader = csv.reader(f)
            for line in reader:
                # Each row is a set of equivalent names; index the row
                # under every name that appears in it
                matches = set(line)
                for match in matches:
                    lookup[match].append(matches)
        return lookup

    def _lookup_name(self, name):
        """Generates a set of equivalent nicknames."""
        name = name.lower()
        if name not in self.lookup:
            return {name}
        names = functools.reduce(operator.or_, self.lookup[name])
        names.add(name)
        return names

    def _first_name_sql(self, first_name, tolerance=1):
        """Generates the select and where statements for the
        fuzzy name match."""
        nicknames = self._lookup_name(first_name)
        first_name_selects = []
        first_name_conditions = []
        for i, name in enumerate(nicknames):
            # e.g. "lower('bob') as match_first_name_0", plus a levenshtein
            # condition on that column; the conditions are OR'd together
            col_name = "match_first_name_{}".format(i)
            select = " lower('{}') as {} ".format(name, col_name)
            first_name_selects.append(select)
            edit_distance = """
                (levenshtein(lower(first_name), {col}) <= {tolerance}
                 OR levenshtein(lower(nickname), {col}) <= {tolerance})
            """.format(col=col_name, tolerance=tolerance)
            first_name_conditions.append(edit_distance)
        name_select = ", ".join(first_name_selects)
        name_conditions = " OR ".join(first_name_conditions)
        return name_select, name_conditions
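# Usage sketch for the name resolver. The names, email, and age here are
# illustrative; find_best_match returns the highest-scoring candidate,
# or None when nothing is within the edit-distance tolerance:
#
#   resolver = NameResolver()
#   resolver.load_member_ids()
#   best = resolver.find_best_match('Bob', 'Smith',
#                                   email='bob.smith@example.com', age=42)
#   if best:
#       print(best['member_id'], best['match_score'])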