Пример #1
0
def get_new_members():
    """Pulls in a list of the most recent members of the Congregation."""
    database = Database()
    jwt_user = get_jwt_identity()

    authorized = utils.check_access(jwt_user, conf.REPORT_GROUP, database)
    utils.log_request(request, jwt_user, authorized)

    if not authorized:
        response = {
            'message': '{} does not have access to reports.'.format(jwt_user)
        }
        return jsonify(response), 403

    limit_param = request.args.get('limit')
    limit = limit_param if limit_param else 25
    new_members = database.read_table('members_view',
                                      limit=limit,
                                      order='desc',
                                      sort='membership_date')
    response = database.to_json(new_members)
    return jsonify(response)
class ParticipantMatcher:
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.database = Database() if not database else database
        self.name_resolver = NameResolver(database=self.database)
        self.participants = Participants(database=self.database)
        self.avg_event_age = {}

    def run(self, limit=1000, iters=None):
        """Adds attendees that have not been matched up to a participant
        to the look up table. If there are no matches for an attendee, a
        new participant id is created."""
        n = 0
        while True:
            missing_attendees = self._get_missing_attendees(limit=limit)
            count = len(missing_attendees)
            if (iters and n >= iters) or count == 0:
                msg = 'Participant matcher has finished processing.'
                self.logger.info(msg)
                break
            msg = 'Iteration {} | Processing {} missing attendees'
            self.logger.info(msg.format(n + 1, count))

            for i in range(count):
                attendee = dict(missing_attendees.loc[i])
                if attendee['first_name'] and attendee['last_name']:
                    self._process_attendee(attendee)
            n += 1

    def estimate_unknown_ages(self):
        """Finds estimated ages for any participant whose age is
        unknown or who has an estimated age."""
        unknowns = self._get_unknown_ages()
        for i, unknown in enumerate(unknowns):
            if i % 1000 == 0:
                msg = 'Estimated age for {} participants.'.format(i)
                self.logger.info(msg)

            estimated_age = self._estimate_participant_age(unknown['id'])
            if not estimated_age:
                continue
            now = datetime.datetime.now()
            estimated_birth_date = now - datetime.timedelta(
                estimated_age * 365)
            estimated_birth_date = "'{}'".format(
                str(estimated_birth_date)[:10])
            self.database.update_column(table='participant_match',
                                        item_id=unknown['id'],
                                        column='birth_date',
                                        value=estimated_birth_date)
            self.database.update_column(table='participant_match',
                                        item_id=unknown['id'],
                                        column='is_birth_date_estimated',
                                        value=True)

    def _process_attendee(self, attendee):
        """Adds a link to attendee_to_participant if the attendee has a match.
        Otherwise a new participant id is created for the attendee."""
        # Cache the average age for the event so it
        # doesn't have to pull it from the database each time
        event_id = attendee['event_id']
        if event_id not in self.avg_event_age:
            age = self._get_avg_event_age(event_id)
            self.avg_event_age[event_id] = age
        else:
            age = self.avg_event_age[event_id]

        match = self.name_resolver.find_best_match(
            first_name=attendee['first_name'],
            last_name=attendee['last_name'],
            email=attendee['email'],
            age=age)
        if match:
            participant_id = match['id']
        else:
            # If there is no participant match, a new participant
            # is created and added to the database
            participant_id = uuid.uuid4().hex
            participant = {
                'id': participant_id,
                'first_name': attendee['first_name'],
                'last_name': attendee['last_name'],
                'email': attendee['email']
            }
            self.database.load_item(participant, 'participant_match')

        # Insert the attendee to participant match to the database
        item = {'id': attendee['id'], 'participant_id': participant_id}
        self.database.load_item(item, 'attendee_to_participant')

    def _get_missing_attendees(self, limit=1000):
        """Pulls a list of attendees that have not yet been matched
        to a participant."""
        sql = """
            SELECT id, event_id, first_name, last_name, email
            FROM {schema}.attendees
            WHERE id NOT IN (SELECT id FROM {schema}.attendee_to_participant)
            AND first_name IS NOT NULL
            AND last_name IS NOT NULL
            ORDER BY event_id ASC
            LIMIT {limit}
        """.format(schema=self.database.schema, limit=limit)
        df = pd.read_sql(sql, self.database.connection)
        return df

    def _get_avg_event_age(self, event_id):
        """Computes the average age of the attendees of an event."""
        if not isinstance(event_id, list):
            event_id = [str(event_id)]
        else:
            event_id = [str(x) for x in event_id]

        sql = """
            SELECT PERCENTILE_CONT(0.5)
                   WITHIN GROUP 
                   (ORDER BY avg_age) as avg_age
            FROM(
                SELECT event_id,
                       PERCENTILE_CONT(0.5)
                       WITHIN GROUP
                       (ORDER BY z.age) as avg_age
                FROM {schema}.attendees x
                INNER JOIN {schema}.attendee_to_participant y
                ON x.id = y.id
                INNER JOIN {schema}.participants z
                ON y.participant_id = z.participant_id
                WHERE event_id = ANY(%(event_id)s)
                AND z.age IS NOT NULL
                GROUP BY event_id
            ) a
        """.format(schema=self.database.schema, event_id=event_id)
        df = self.database.fetch_df(sql, params={'event_id': event_id})
        avg_age = None
        if len(df) > 0:
            avg_age = df.loc[0]['avg_age']
        return avg_age

    def _estimate_participant_age(self, participant_id):
        """Estimates a participants age based on who they've
        attended events with."""
        events = self.participants.get_participant_events(participant_id)
        if len(events) == 0:
            return None
        else:
            event_id = [x['event_id'] for x in events]
        age = self._get_avg_event_age(event_id)
        return age

    def _get_unknown_ages(self):
        """Pulls all participant ids that have a null date or
        and estimated date."""
        sql = """
            SELECT id
            FROM {schema}.participant_match
            WHERE is_birth_date_estimated = TRUE
            OR birth_date IS NULL
        """.format(schema=self.database.schema)
        df = pd.read_sql(sql, self.database.connection)
        results = self.database.to_json(df)
        return results
Пример #3
0
class NameResolver():
    """Resolves the names of participants using participant characteristics."""
    def __init__(self, database=None):
        daiquiri.setup(level=logging.INFO)
        self.logger = daiquiri.getLogger(__name__)

        self.path = os.path.dirname(os.path.realpath(__file__))
        self.database = Database() if not database else database

        self.lookup = self._read_names_file()
        self.average_age = None

    def load_member_ids(self):
        """Loads member information into the participant match table.
        Only loads names that have already been loaded into the database.
        """
        sql = """
            INSERT INTO {schema}.participant_match
            (id, member_id, first_name, last_name, nickname,
             email, birth_date, is_birth_date_estimated)
            SELECT uuid_generate_v4(), id as member_id, first_name, last_name,
                   nickname, email, birth_date, false
            FROM {schema}.members
            WHERE id NOT IN (SELECT member_id FROM {schema}.participant_match)
        """.format(schema=self.database.schema)
        self.database.run_query(sql)

    def get_fuzzy_matches(self, first_name, last_name, tolerance=1):
        """Returns all names from the participants table that are within edit
        distance tolerance of the first name and last name."""
        # Add PostgreSQL escape characters
        first_name = first_name.replace("'", "''")
        last_name = last_name.replace("'", "''")
        select, conditions = self._first_name_sql(first_name, tolerance)

        sql = """
            SELECT id, member_id, first_name, last_name, nickname,
                   email, birth_date, is_birth_date_estimated
            FROM(
                SELECT *, {select}
                FROM {schema}.participant_match
            ) x
            WHERE
              ( ({conditions})
              AND last_name = '{last_name}')
        """.format(select=select, conditions=conditions,
                   schema=self.database.schema,
                   first_name=first_name, last_name=last_name,
                   tol=tolerance)
        df = pd.read_sql(sql, self.database.connection)
        results = self.database.to_json(df)
        return results

    def find_best_match(self, first_name, last_name, email=None, age=None):
        """Finds the best, given the criteria that is provide.
        If there are not matches, None will be returned."""
        matches = self.get_fuzzy_matches(first_name, last_name)
        if not self.average_age:
            self.average_age = self._get_average_age()
        if not matches:
            return None
        else:
            for match in matches:
                if not match['birth_date'] or match['birth_date'] < 0:
                    match['age'] = self.average_age
                else:
                    match['age'] = compute_age(match['birth_date'])
                match_score = compute_match_score(match,
                                                  first_name=first_name,
                                                  email=email,
                                                  age=age)
                match['match_score'] = match_score

            sorted_matches = sorted(matches, 
                                    key=lambda k: k['match_score'],
                                    reverse=True)
            return sorted_matches[0]
    
    def _get_average_age(self):
        """Pulls the average participant age. Is used if there is an
        observation that does not have an age recorded."""
        sql = """
            SELECT AVG(age) as avg_age
            FROM(
                SELECT DATE_PART('year', AGE(now(), birth_date)) as age
                FROM {schema}.participant_match
                WHERE birth_date is not null
            ) x
        """.format(schema=self.database.schema)
        df = pd.read_sql(sql, self.database.connection)
        avg_age = None
        if len(df) > 0:
            avg_age = df.loc[0]['avg_age']
        return avg_age

    def _read_names_file(self):
        """Reads the names.csv, which contains mappings of names
        to nicknames."""
        filename = os.path.join(self.path, 'names.csv')
        lookup = collections.defaultdict(list)
        with open(filename) as f:
            reader = csv.reader(f)
            for line in reader:
                matches = set(line)
                for match in matches:
                    lookup[match].append(matches)
        return lookup

    def _lookup_name(self, name):
        """Generates a sets of equivalent nicknames."""
        name = name.lower()
        if name not in self.lookup:
            return { name }
        names = functools.reduce(operator.or_, self.lookup[name])
        names.add(name)
        return names

    def _first_name_sql(self, first_name, tolerance=1):
        """Generates the select and where statments for the name
        fuzzy match."""
        nicknames = self._lookup_name(first_name)
        first_name_selects = []
        first_name_conditions = []
        for i, name in enumerate(nicknames):
            col_name = "match_first_name_{}".format(i)
            select = " lower('{}') as {} ".format(name, col_name)
            first_name_selects.append(select)
            edit_distance = """
                (levenshtein(lower(first_name), {col}) <= {tolerance}
                 OR levenshtein(lower(nickname), {col}) <= {tolerance})
            """.format(col=col_name, tolerance=tolerance)
            first_name_conditions.append(edit_distance)
        name_select = ", ".join(first_name_selects)
        name_conditions = " OR ".join(first_name_conditions)
        return name_select, name_conditions