Пример #1
0
def initialize_log_table():
    """Builds the table in the postgres database that is used for storing
    application logs."""
    database = Database(database='postgres')
    LOGGER.info('Creating the application_logs schema ...')
    schema_sql = "CREATE SCHEMA IF NOT EXISTS application_logs"
    database.run_query(schema_sql)
    table_sql = """
    CREATE TABLE IF NOT EXISTS application_logs.shir_connect_logs (
        id text,
        application_user text,
        authorized boolean,
        base_url text,
        endpoint text,
        host text,
        host_url text,
        query_string text,
        referrer text,
        remote_addr text,
        scheme text,
        url text,
        url_root text,
        user_agent text,
        load_datetime timestamp
    )
    """
    LOGGER.info('Creating the shir_connect_logs table ...')
    database.run_query(table_sql)
Пример #2
0
class MM2000:
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        # Load column mapping configs
        self.path = os.path.dirname(os.path.realpath(__file__))
        filename = self.path + '/member_columns.yml'
        with open(filename, 'r') as f:
            self.column_mapping = yaml.safe_load(f)

        self.database = Database() if not database else database
        self.fake_news = FakeNews(database=self.database)

    #####################################
    # Methods for loading MM2000 members
    #####################################

    def load(self, df):
        """ Loads the data in to the member database """
        self.logger.info('Parsing MM2000 data.')
        items = self.parse_mm2000(df)

        self.logger.info('Backing up current member table.')
        self.database.backup_table('members')
        self.logger.info('Truncating current member table.')
        self.database.truncate_table('members')
        self.logger.info('Loading updated member data.')
        for item in items:
            self.database.load_item(item, 'members')

        self.logger.info('Checking updated columns.')
        good_columns = self.check_columns()
        if good_columns:
            self.logger.info('Generating demo data')
            self.fake_news.fake_names()
            self.logger.info('Refreshing materialized views.')
            self.database.refresh_view('members_view')
            self.database.refresh_view('participants')
        else:
            self.logger.warning('Column mismatch in upload')
            self.database.revert_table('members')
            return False

        return True

    def parse_mm2000(self, df):
        """ Converts the MM2000 export into a list of rows """
        column_mapping = self.column_mapping['MM2000']
        items = []
        for group in column_mapping:
            column_map = column_mapping[group]['columns']
            df_group = _group_mm2000(df, column_map)

            if 'id_extension' in column_mapping[group]:
                id_extension = column_mapping[group]['id_extension']
            else:
                id_extension = None

            for i in df_group.index:
                item = dict(df_group.loc[i])
                item = _parse_postal_code(item)
                item = _check_mm2000_active(item)

                # ID extension for children and spouses
                # since a family shares the same id
                item['household_id'] = item['id']
                if id_extension:
                    item['id'] += id_extension

                # Remove invalid birthdates
                item = _parse_mm2000_date(item, 'birth_date')
                item = _parse_mm2000_date(item, 'membership_date')

                # Skip if the member is under the minimum age
                # that we keep in the database
                too_young = utils.check_age(item['birth_date'], min_age=18)
                if too_young:
                    continue

                # Children only have a full name, not separate
                # first names and last name
                if 'first_name' not in item and item['full_name']:
                    item['first_name'] = item['full_name'].split()[0]
                if 'last_name' not in item and item['full_name']:
                    item['last_name'] = item['full_name'].split()[0]
                if not item['first_name'] or not item['last_name']:
                    continue
                else:
                    items.append(item)
        return items

    def check_columns(self):
        """ Checks to make sure the columns are the same in the new table """
        new_columns = self.database.get_columns('members')
        old_columns = self.database.get_columns('members_backup')
        for column in new_columns:
            if column not in old_columns:
                return False
        return True

    ###########################################
    # Methods for handling MM2000 resignations
    ###########################################

    def load_resignations(self, df):
        """Loads MM2000 resignation data into the database."""
        _validate_resignation_data(df)
        # Map the file column names to the databse column names
        df = df.rename(columns=self.column_mapping['MM2000 Resignations'])
        # Drop any rows where the resignation date is null
        df = df.dropna(axis=0, how='any', subset=['resignation_date'])
        for i in df.index:
            member = dict(df.loc[i])
            member = _parse_mm2000_date(member, 'resignation_date')
            resignation_date = str(member['resignation_date'])[:10]
            # TODO: This logic is specific to TRS because that's how they
            # track people who rejoined the congregation. We may have to
            # update this if another client uses MM2000
            if 'Comment1' in member:
                if 'rejoin' in str(member['Comment1']).lower():
                    resignation_date = None
            if 'Comment2' in member:
                if 'rejoin' in str(member['Comment2']).lower():
                    resignation_date = None

            if resignation_date:
                resignation_date = "'{}'".format(resignation_date)
                sql = """
                    UPDATE {schema}.members
                    SET resignation_date = {resignation_date}
                    WHERE (household_id = '{member_id}'
                        OR id = '{member_id}')
                """.format(schema=self.database.schema,
                           resignation_date=resignation_date,
                           member_id=member['id'])
                self.database.run_query(sql)

                reason = _find_resignation_reason(member['resignation_reason'])
                sql = """
                    UPDATE {schema}.members
                    SET resignation_reason = '{reason}'
                    WHERE (household_id = '{member_id}'
                        OR id = '{member_id}')
                """.format(schema=self.database.schema,
                           reason=reason,
                           member_id=member['id'])
                self.database.run_query(sql)

        self.database.refresh_views()
Пример #3
0
class NameResolver():
    """Resolves the names of participants using participant characteristics."""
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.path = os.path.dirname(os.path.realpath(__file__))
        self.database = Database() if not database else database

        self.lookup = self._read_names_file()
        self.average_age = None

    def load_member_ids(self):
        """Loads member information into the participant match table.
        Only loads names that have already been loaded into the database.
        """
        sql = """
            INSERT INTO {schema}.participant_match
            (id, member_id, first_name, last_name, nickname,
             email, birth_date, is_birth_date_estimated)
            SELECT uuid_generate_v4(), id as member_id, first_name, last_name,
                   nickname, email, birth_date, false
            FROM {schema}.members
            WHERE id NOT IN (SELECT member_id FROM {schema}.participant_match)
        """.format(schema=self.database.schema)
        self.database.run_query(sql)

    def get_fuzzy_matches(self, first_name, last_name, tolerance=1):
        """Returns all names from the participants table that are within edit
        distance tolerance of the first name and last name."""
        # Add PostgreSQL escape characters
        first_name = first_name.replace("'", "''")
        last_name = last_name.replace("'", "''")
        select, conditions = self._first_name_sql(first_name, tolerance)

        sql = """
            SELECT id, member_id, first_name, last_name, nickname,
                   email, birth_date, is_birth_date_estimated
            FROM(
                SELECT *, {select}
                FROM {schema}.participant_match
            ) x
            WHERE
              ( ({conditions})
              AND last_name = '{last_name}')
        """.format(select=select, conditions=conditions,
                   schema=self.database.schema,
                   first_name=first_name, last_name=last_name,
                   tol=tolerance)
        df = pd.read_sql(sql, self.database.connection)
        results = self.database.to_json(df)
        return results

    def find_best_match(self, first_name, last_name, email=None, age=None):
        """Finds the best, given the criteria that is provide.
        If there are not matches, None will be returned."""
        matches = self.get_fuzzy_matches(first_name, last_name)
        if not self.average_age:
            self.average_age = self._get_average_age()
        if not matches:
            return None
        else:
            for match in matches:
                if not match['birth_date'] or match['birth_date'] < 0:
                    match['age'] = self.average_age
                else:
                    match['age'] = compute_age(match['birth_date'])
                match_score = compute_match_score(match,
                                                  first_name=first_name,
                                                  email=email,
                                                  age=age)
                match['match_score'] = match_score

            sorted_matches = sorted(matches, 
                                    key=lambda k: k['match_score'],
                                    reverse=True)
            return sorted_matches[0]
    
    def _get_average_age(self):
        """Pulls the average participant age. Is used if there is an
        observation that does not have an age recorded."""
        sql = """
            SELECT AVG(age) as avg_age
            FROM(
                SELECT DATE_PART('year', AGE(now(), birth_date)) as age
                FROM {schema}.participant_match
                WHERE birth_date is not null
            ) x
        """.format(schema=self.database.schema)
        df = pd.read_sql(sql, self.database.connection)
        avg_age = None
        if len(df) > 0:
            avg_age = df.loc[0]['avg_age']
        return avg_age

    def _read_names_file(self):
        """Reads the names.csv, which contains mappings of names
        to nicknames."""
        filename = os.path.join(self.path, 'names.csv')
        lookup = collections.defaultdict(list)
        with open(filename) as f:
            reader = csv.reader(f)
            for line in reader:
                matches = set(line)
                for match in matches:
                    lookup[match].append(matches)
        return lookup

    def _lookup_name(self, name):
        """Generates a sets of equivalent nicknames."""
        name = name.lower()
        if name not in self.lookup:
            return { name }
        names = functools.reduce(operator.or_, self.lookup[name])
        names.add(name)
        return names

    def _first_name_sql(self, first_name, tolerance=1):
        """Generates the select and where statments for the name
        fuzzy match."""
        nicknames = self._lookup_name(first_name)
        first_name_selects = []
        first_name_conditions = []
        for i, name in enumerate(nicknames):
            col_name = "match_first_name_{}".format(i)
            select = " lower('{}') as {} ".format(name, col_name)
            first_name_selects.append(select)
            edit_distance = """
                (levenshtein(lower(first_name), {col}) <= {tolerance}
                 OR levenshtein(lower(nickname), {col}) <= {tolerance})
            """.format(col=col_name, tolerance=tolerance)
            first_name_conditions.append(edit_distance)
        name_select = ", ".join(first_name_selects)
        name_conditions = " OR ".join(first_name_conditions)
        return name_select, name_conditions