示例#1
0
        def decorated(*args, **kwargs):
            """Authenticate the caller and check session access, then call ``f``.

            The API key is looked up, in order, in the Flask session's
            ``user_id``, the ``api_key`` form field, the ``api_key`` query
            argument and finally an ``api_key`` member of a JSON request body.

            Returns a 401 JSON response when no key is supplied or when the key
            does not match a user, redirects to ``admin.index`` when no session
            id is available or the user's groups do not own that session, and
            otherwise returns the result of ``f(*args, **kwargs)``.
            """
            def unauthorized(message):
                # One JSON error envelope for every authentication failure.
                body = json.dumps({'status': 'error', 'message': message})
                response = make_response(body, 401)
                response.headers['Content-Type'] = 'application/json'
                return response

            if flask_session.get('user_id'):
                api_key = flask_session['user_id']
            elif request.form.get('api_key'):
                api_key = request.form['api_key']
            elif request.args.get('api_key'):
                api_key = request.args['api_key']
            else:
                try:
                    api_key = json.loads(request.data).get('api_key')
                except (ValueError, AttributeError):
                    # The body is not JSON, or is JSON but not an object.
                    api_key = None
            if not api_key:
                return unauthorized("'api_key' is a required parameter")

            user = db_session.query(User).get(api_key)
            if user is None:
                # Without this guard an unknown key fails on ``user.groups``.
                return unauthorized("'api_key' is not valid")

            # Remember which dedupe sessions belong to the user's groups.
            group_ids = [group.id for group in user.groups]
            sessions = db_session.query(DedupeSession)\
                .filter(DedupeSession.group.has(Group.id.in_(group_ids)))\
                .all()
            flask_session['user_sessions'] = [s.id for s in sessions]
            flask_session['api_key'] = api_key

            # A session id in the query string replaces the remembered one.
            requested_id = request.args.get('session_id')
            if requested_id:
                flask_session['session_id'] = requested_id
            elif not flask_session.get('session_id'):
                flash("Sorry, could not find a session id")
                return redirect(url_for('admin.index'))
            if flask_session['session_id'] not in flask_session['user_sessions']:
                flash("Sorry, you don't have access to that session")
                return redirect(url_for('admin.index'))

            return f(*args, **kwargs)
示例#2
0
def delete_session():
    """Delete the current dedupe session and queue removal of its tables.

    Reads ``session_id`` from the Flask session, deletes the matching
    ``DedupeSession`` row, passes the per-session table name patterns to the
    ``cleanupTables`` task and returns a JSON body echoing the session id.
    """
    session_id = flask_session['session_id']
    doomed = db_session.query(DedupeSession).get(session_id)
    db_session.delete(doomed)
    db_session.commit()

    # Name patterns are handed over unformatted, together with the session id.
    table_patterns = [
        'entity_{0}',
        'entity_{0}_cr',
        'raw_{0}',
        'processed_{0}',
        'processed_{0}_cr',
        'block_{0}',
        'block_{0}_cr',
        'plural_block_{0}',
        'plural_block_{0}_cr',
        'cr_{0}',
        'covered_{0}',
        'covered_{0}_cr',
        'plural_key_{0}',
        'plural_key_{0}_cr',
        'small_cov_{0}',
        'small_cov_{0}_cr',
        'canon_{0}',
        'exact_match_{0}',
        'match_blocks_{0}',
    ]
    cleanupTables.delay(session_id, tables=table_patterns)

    payload = json.dumps({'session_id': session_id, 'status': 'ok'})
    response = make_response(payload)
    response.headers['Content-Type'] = 'application/json'
    return response
def mark_pair():
    """Record the user's answer for the current training pair.

    The ``action`` query argument selects the behaviour: ``yes`` and ``no``
    append the pair to the match / distinct labels, ``finish`` queues the
    ``dedupeRaw`` task and drops the deduper from the Flask session, and
    anything else counts as "unsure". The labels are saved on the session
    row and passed to ``deduper.markPairs``. Returns a JSON response holding
    either the counters or ``{'finished': True}``.
    """
    action = request.args['action']
    flask_session['last_interaction'] = datetime.now()
    counter = flask_session.get('counter')
    sess = db_session.query(DedupeSession).get(flask_session['session_id'])
    deduper = flask_session['deduper']

    # Attempt to cast the training input appropriately
    # TODO: Figure out LatLong type
    types_by_field = {}
    for definition in json.loads(sess.field_defs):
        types_by_field.setdefault(definition['field'], [])\
            .append(definition['type'])

    def cast_record(record):
        # Fields declared as 'Price' become floats; the rest pass through.
        return dict(
            (name, float(value) if 'Price' in types_by_field[name] else value)
            for name, value in record.items())

    left, right = flask_session['current_pair']
    current_pair = [cast_record(left), cast_record(right)]

    labels = (json.loads(sess.training_data) if sess.training_data
              else {'distinct': [], 'match': []})

    finished = action == 'finish'
    if finished:
        dedupeRaw.delay(flask_session['session_id'])
        result = {'finished': True}
        flask_session['dedupe_start'] = time.time()
    else:
        if action == 'yes':
            labels['match'].append(current_pair)
            counter['yes'] += 1
        elif action == 'no':
            labels['distinct'].append(current_pair)
            counter['no'] += 1
        else:
            counter['unsure'] += 1
            flask_session['counter'] = counter
        result = {'counter': counter}

    sess.training_data = json.dumps(labels, default=_to_json)
    db_session.add(sess)
    db_session.commit()
    deduper.markPairs(labels)
    if finished:
        del flask_session['deduper']

    response = make_response(json.dumps(result))
    response.headers['Content-Type'] = 'application/json'
    return response
def upload():
    """Accept an uploaded data file and start a new dedupe session for it.

    Reads the ``input_file`` upload, converts it with ``convert`` when its
    extension is not ``csv``, stores the slugified header names in the Flask
    session, creates a ``DedupeSession`` row for the user's first group,
    writes the CSV text to ``/tmp/<session_id>_raw.csv`` and queues
    ``initializeSession``. Returns a JSON body with ``ready`` and the new
    ``session_id``.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # Split once from the right so only the final extension is examined
    # ("my.data.csv" -> "csv"); a name without a dot yields the whole name
    # instead of raising IndexError.
    file_type = f.filename.rsplit('.', 1)[-1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv':  # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # The first line is treated as the header row.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(id=session_id,
                         name=request.form.get('name'),
                         description=request.form.get('description'),
                         filename=f.filename,
                         group=group,
                         status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
示例#5
0
def get_unmatched():
    """Return one raw record that has no row in the session's entity table.

    The JSON response carries the record under ``object`` and the session's
    ``review_count`` under ``remaining``. When no such record is found the
    session status is set to ``canonical`` and ``object`` stays empty.
    """
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)

    payload = {
        'status': 'ok',
        'message': '',
        'object': {},
        'remaining': dedupe_session.review_count,
    }

    definitions = json.loads(dedupe_session.field_defs)
    raw_fields = list(set([d['field'] for d in definitions])) + ['record_id']
    fields = ', '.join(['r.{0}'.format(name) for name in raw_fields])
    sel = ''' 
      SELECT {0}
      FROM "raw_{1}" as r
      LEFT JOIN "entity_{1}" as e
        ON r.record_id = e.record_id
      WHERE e.record_id IS NULL
      LIMIT 1
    '''.format(fields, session_id)
    with db_session.bind.begin() as conn:
        rows = [dict(zip(raw_fields, row)) for row in conn.execute(sel)]

    if rows:
        payload['object'] = rows[0]
    else:
        # Nothing left unmatched: move the session on to the next stage.
        dedupe_session.status = 'canonical'
        db_session.add(dedupe_session)
        db_session.commit()

    response = make_response(json.dumps(payload), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#6
0
def get_canon_cluster():
    """Return the next canonical cluster to review as JSON.

    Calls ``checkinSessions`` first, then asks ``getCluster`` for a cluster
    from the ``entity_{0}_cr`` / ``cr_{0}`` tables. When none comes back the
    ``getMatchingReady`` task is queued instead. The session's entity and
    review counts are always included in the response.
    """
    session_id = flask_session['session_id']

    checkinSessions()
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    entity_id, cluster, prediction = getCluster(
        session_id, 'entity_{0}_cr', 'cr_{0}')

    payload = {
        'status': 'ok',
        'message': '',
        'objects': [],
    }
    if not cluster:
        getMatchingReady.delay(session_id)
    else:
        payload['entity_id'] = entity_id
        payload['objects'] = cluster
        payload['prediction'] = prediction
    payload['total_clusters'] = dedupe_session.entity_count
    payload['review_remainder'] = dedupe_session.review_count

    response = make_response(json.dumps(payload), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#7
0
def field_definitions():
    """Serve the stored ``field_defs`` of the current session as JSON."""
    record = db_session.query(DedupeSession).get(flask_session['session_id'])
    response = make_response(record.field_defs, 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#8
0
def settings_file():
    """Serve the current session's settings file as a download.

    The attachment is named ``<session id>.dedupe_settings``.
    """
    record = db_session.query(DedupeSession).get(flask_session['session_id'])
    response = make_response(record.settings_file, 200)
    disposition = 'attachment; filename=%s.dedupe_settings' % record.id
    response.headers['Content-Disposition'] = disposition
    return response
示例#9
0
    def validate(self):
        """Validate the form and reject names or emails that already exist.

        Returns ``False`` when the base ``Form`` validation fails or when a
        ``User`` row already has the submitted name or email (an error is
        appended to the offending field); returns ``True`` otherwise.
        """
        if not Form.validate(self):
            return False

        # Checked in order; the first duplicate found stops validation.
        uniqueness_checks = (
            (self.name, User.name, 'Name is already registered'),
            (self.email, User.email, 'Email address is already registered'),
        )
        for field, column, message in uniqueness_checks:
            taken = db_session.query(User)\
                .filter(column == field.data).first()
            if taken:
                field.errors.append(message)
                return False

        return True
示例#10
0
def clear_error():
    """Mark the ``WorkTable`` row named by the ``work_id`` argument as cleared.

    Returns ``{"status": "ok"}`` on success. When no row matches ``work_id``
    a JSON error with status 404 is returned instead of failing on ``None``.
    """
    work_id = request.args['work_id']
    work = db_session.query(WorkTable).get(work_id)
    if work is None:
        body = {
            'status': 'error',
            'message': '"work_id" does not match an existing record'
        }
        status_code = 404
    else:
        work.cleared = True
        db_session.add(work)
        db_session.commit()
        body = {'status': 'ok'}
        status_code = 200
    response = make_response(json.dumps(body), status_code)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#11
0
def getDistinct(field_name, session_id):
    """Return the distinct, non-empty values of one processed-table column.

    ``field_name`` is the column to read from the ``processed_<session_id>``
    table. Values are converted with ``unicode`` and returned as a list with
    duplicates removed.
    """
    processed = Table('processed_%s' % session_id, MetaData(),
                      autoload=True, autoload_with=app_session.bind)
    column = getattr(processed.c, field_name)
    # The comparisons are handed to and_() as filter expressions, so they
    # are written with ``!=`` rather than ``is not``.
    has_value = and_(column != None, column != '')
    rows = app_session.query(distinct(column)).filter(has_value).all()
    return list({unicode(row[0]) for row in rows})
示例#12
0
def mark_all_canon_cluster():
    """Queue ``bulkMarkCanonClusters`` for the current session.

    The task receives the session id and the name of the user identified by
    the ``api_key`` stored in the Flask session. Returns an empty JSON object.
    """
    session_id = flask_session['session_id']
    reviewer = db_session.query(User).get(flask_session['api_key'])
    bulkMarkCanonClusters.delay(session_id, user=reviewer.name)

    response = make_response(json.dumps({}), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#13
0
def select_field_types():
    """Show the field-type form and, on POST, save the chosen field definitions.

    Form keys are grouped by the part before their last underscore. Every
    value of a key ending in ``type`` becomes one
    ``{'field': ..., 'type': ...}`` definition, and a ``<field>_missing`` key
    adds ``has_missing`` to that field's definitions. The definitions are
    stored on the session row, ``initializeModel`` is queued when there are
    no uncleared work-table errors, and the user is redirected to the
    training run. On GET the form template is rendered.
    """
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    errors = db_session.query(WorkTable)\
            .filter(WorkTable.session_id == dedupe_session.id)\
            .filter(WorkTable.cleared == False)\
            .all()
    errors = [e.value for e in errors]
    field_list = flask_session['field_list']
    if request.method == 'POST':
        form = dict((k, request.form.getlist(k))
                    for k in request.form.keys() if k != 'csrf_token')

        def field_of(item):
            # 'name_type' / 'name_missing' -> 'name'
            return item[0].rsplit('_', 1)[0]

        # groupby only merges adjacent items, so the list must be sorted by
        # the grouping key itself; the full key is kept as a tie-breaker so
        # the order inside each group is unchanged.
        ftypes = sorted(form.items(),
                        key=lambda item: (field_of(item), item[0]))
        field_defs = []
        for k, g in groupby(ftypes, key=field_of):
            vals = list(g)
            missing_key = '{0}_missing'.format(k)
            has_missing = any(ftype == missing_key for ftype, _ in vals)
            for field, val in vals:
                if not field.endswith('type'):
                    continue
                for type_name in val:
                    definition = {'field': k, 'type': type_name}
                    if has_missing:
                        definition['has_missing'] = True
                    field_defs.append(definition)
        dedupe_session.field_defs = json.dumps(field_defs)
        dedupe_session.status = 'model defined'
        db_session.add(dedupe_session)
        db_session.commit()
        if not errors:
            initializeModel.delay(dedupe_session.id)
        return redirect(url_for('trainer.training_run'))
    return render_template('dedupe_session/select_field_types.html',
                           field_list=field_list,
                           dedupe_session=dedupe_session,
                           errors=errors)
示例#14
0
def training_data():
    """Serve the current session's training data as a plain-text download.

    The attachment is named ``<session id>_training.json``.
    """
    record = db_session.query(DedupeSession).get(flask_session['session_id'])

    response = make_response(record.training_data, 200)
    response.headers['Content-Type'] = 'text/plain'
    disposition = 'attachment; filename=%s_training.json' % record.id
    response.headers['Content-Disposition'] = disposition
    return response
示例#15
0
def mark_all_clusters():
    """Queue ``bulkMarkClusters`` for the current session.

    The task receives the session id and the name of the user identified by
    the ``api_key`` stored in the Flask session. Returns an ``ok`` JSON body.
    """
    session_id = flask_session['session_id']
    # Need to update existing clusters with new entity_id here, too.
    reviewer = db_session.query(User).get(flask_session['api_key'])
    bulkMarkClusters.delay(session_id, user=reviewer.name)

    payload = json.dumps({'status': 'ok', 'message': ''})
    response = make_response(payload, 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#16
0
def session_review():
    """Render the session review page.

    ``first_review`` is ``False`` when a truthy ``second_review`` query
    argument is present and ``True`` otherwise.
    """
    first_review = not request.args.get('second_review')
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)

    return render_template('dedupe_session/session-review.html',
                           session_id=flask_session['session_id'],
                           first_review=first_review,
                           dedupe_session=dedupe_session)
示例#17
0
def getCluster(session_id, entity_pattern, raw_pattern):
    """Fetch the next cluster to review and check it out for one minute.

    ``entity_pattern`` and ``raw_pattern`` are table-name patterns formatted
    with ``session_id``. The next entity id comes from the session's pickled
    review machine, whose updated state is saved before the records are read.

    Returns ``(entity_id, records, prediction)`` where ``records`` is a list
    of dicts keyed by ``confidence``, the model fields and ``record_id``, or
    ``(None, None, None)`` when the entity has no records.
    """
    ent_name = entity_pattern.format(session_id)
    raw_name = raw_pattern.format(session_id)
    sess = app_session.query(DedupeSession).get(session_id)
    app_session.refresh(sess)

    # Advance the review machine and store its new state straight away.
    machine = cPickle.loads(sess.review_machine)
    entity_id = machine.get_next()
    sess.review_machine = cPickle.dumps(machine)
    app_session.add(sess)
    app_session.commit()

    engine = app_session.bind
    model_fields = list(set([f['field'] for f in json.loads(sess.field_defs)]))
    raw_cols = ', '.join(['r.{0}'.format(f) for f in model_fields])
    sel = text('''
        SELECT 
            e.confidence,
            {0},
            r.record_id
        FROM "{1}" AS r
        JOIN "{2}" as e 
            ON r.record_id = e.record_id
        WHERE e.entity_id = :entity_id
        ORDER BY e.confidence
        '''.format(raw_cols, raw_name, ent_name))
    records = list(engine.execute(sel, entity_id=entity_id))
    if not records:
        return None, None, None

    # Column order matches the SELECT list above.
    column_names = ['confidence'] + model_fields + ['record_id']
    max_confidence = max(row['confidence'] for row in records)
    prediction = machine.predict([max_confidence, len(records)])
    cluster_list = [dict(zip(column_names, row)) for row in records]

    checkout_expire = datetime.now() + timedelta(minutes=1)
    upd = text(''' 
            UPDATE "{0}" SET
              checked_out = TRUE,
              checkout_expire = :one_minute
            WHERE entity_id = :entity_id
            '''.format(ent_name))
    with engine.begin() as conn:
        conn.execute(upd, entity_id=entity_id, one_minute=checkout_expire)
    return entity_id, cluster_list, prediction
示例#18
0
 def decorated(*args, **kwargs):
     """Call ``f`` only for a logged-in user whose roles are all in ``roles``.

     Redirects to ``auth.login`` when no ``user_id`` is in the Flask session
     and to ``admin.index`` (with a flash message) when the role check fails.
     """
     user_id = flask_session.get('user_id')
     if not user_id:
         return redirect(url_for('auth.login'))
     account = db_session.query(User).get(user_id)
     held = {role.name for role in account.roles}
     allowed = set(roles)
     # NOTE(review): this is a subset test, so a user holding no roles at
     # all passes it -- confirm that is intended.
     if held - allowed:
         flash('Sorry, you don\'t have access to that page')
         return redirect(url_for('admin.index'))
     return f(*args, **kwargs)
示例#19
0
def select_fields():
    """Show the field-selection form and store the chosen fields on POST.

    On POST with at least one field selected, the selection is saved in the
    Flask session and the user is redirected to ``select_field_types``.
    Otherwise the form is rendered with the available fields and any
    uncleared work-table errors.
    """
    status_code = 200
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    fields = flask_session.get('fieldnames')
    # If the fields are not in the session, that means that the user has come
    # here directly from the home page. We'll try to load them from the raw
    # table in the database but if that does not exist yet (which is possible)
    # then we'll redirect them to the home page.
    if not fields:
        try:
            raw = Table('raw_{0}'.format(session_id),
                        MetaData(),
                        autoload=True,
                        autoload_with=db_session.bind,
                        keep_existing=True)
        except NoSuchTableError:
            return redirect(url_for('admin.index'))
        fields = [name for name in raw.columns.keys() if name != 'record_id']
        flask_session['fieldnames'] = fields

    errors = db_session.query(WorkTable)\
            .filter(WorkTable.session_id == dedupe_session.id)\
            .filter(WorkTable.cleared == False)\
            .all()
    if request.method == 'POST':
        field_list = [key for key in request.form if key != 'csrf_token']
        flask_session['field_list'] = field_list
        if field_list:
            return redirect(url_for('trainer.select_field_types'))
        errors = ['You must select at least one field to compare on.']
        # NOTE(review): status_code is never passed to the response below,
        # so the page is still served with the default status.
        status_code = 400

    return render_template('dedupe_session/select_fields.html',
                           errors=errors,
                           fields=fields,
                           dedupe_session=dedupe_session)
示例#20
0
def training_run():
    """Prepare the Flask session for training and render the training page.

    The yes/no counters are seeded from any training data already stored on
    the session row. When there are no uncleared work-table errors and a
    sample exists, a ``dedupe.Dedupe`` instance is built from the field
    definitions and sample and kept in the Flask session. The page is served
    with status 200, or 500 when errors are present.
    """
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])

    yes_count = no_count = 0
    if dedupe_session.training_data:
        td = json.loads(dedupe_session.training_data)
        yes_count = len(td['match'])
        no_count = len(td['distinct'])
    flask_session['counter'] = {
        'yes': yes_count,
        'no': no_count,
        'unsure': 0,
    }

    errors = db_session.query(WorkTable)\
            .filter(WorkTable.session_id == dedupe_session.id)\
            .filter(WorkTable.cleared == False)\
            .all()
    if errors:
        status_code = 500
    else:
        status_code = 200
        field_defs = json.loads(dedupe_session.field_defs)
        if dedupe_session.sample:
            sample = cPickle.loads(dedupe_session.sample)
            flask_session['deduper'] = dedupe.Dedupe(field_defs,
                                                     data_sample=sample)
    time.sleep(1)
    db_session.refresh(dedupe_session)
    page = render_template('dedupe_session/training_run.html',
                           errors=errors,
                           dedupe_session=dedupe_session)
    return make_response(page, status_code)
示例#21
0
def checkinSessions():
    """Release expired checkouts in every session's entity table.

    For each ``DedupeSession`` id, rows of ``entity_<id>`` whose
    ``checkout_expire`` is not later than now and that are not clustered get
    ``checked_out`` reset to ``False`` and ``checkout_expire`` cleared.
    Sessions without an entity table are skipped.
    """
    now = datetime.now()
    session_ids = [row.id for row in app_session.query(DedupeSession.id).all()]
    engine = init_engine(current_app.config['DB_CONN'])
    for sess_id in session_ids:
        try:
            entity = Table('entity_%s' % sess_id, Base.metadata,
                           autoload=True, autoload_with=engine)
            release = entity.update()\
                .where(entity.c.checkout_expire <= now)\
                .where(entity.c.clustered == False)\
                .values(checked_out=False, checkout_expire=None)
            with engine.begin() as conn:
                conn.execute(release)
        except NoSuchTableError:  # pragma: no cover
            continue
    return None
示例#22
0
    def validate(self):
        """Validate the login form and attach the matching user.

        Returns ``False`` when the base ``Form`` validation fails, when no
        user has the submitted email (compared case-insensitively) or when
        the password check fails; the relevant field gets an error message.
        On success the user is stored on ``self.user`` and ``True`` is
        returned.
        """
        if not Form.validate(self):
            return False

        same_email = func.lower(User.email) == func.lower(self.email.data)
        account = db_session.query(User).filter(same_email).first()
        if account is None:
            self.email.errors.append('Email address is not registered')
            return False

        password_ok = account.check_password(account.name, self.password.data)
        if not password_ok:
            self.password.errors.append('Password is not valid')
            return False

        self.user = account
        return True
示例#23
0
def validate_post(post):
    """Check a decoded request body for a session id and a match object.

    ``post`` is a dict-like object read with ``.get``. Returns a tuple
    ``(result, status_code, sess)`` where ``result`` is a dict with
    ``status``, ``message`` and the submitted ``object``; ``status_code`` is
    200, 400 or 401; and ``sess`` is the ``DedupeSession`` row, or ``None``
    when it was not found or no id was given.
    """
    session_id = post.get('session_id')
    obj = post.get('object')
    r = {'status': 'ok', 'message': '', 'object': obj}
    status_code = 200

    # Only query the database when there is an id to look up.
    sess = None
    if session_id:
        sess = db_session.query(DedupeSession).get(session_id)

    if not session_id:
        r['status'] = 'error'
        r['message'] = 'Session ID is required'
        status_code = 401
    elif not obj:
        r['status'] = 'error'
        r['message'] = 'Match object is required'
        status_code = 400
    elif not sess:
        r['status'] = 'error'
        r['message'] = 'Invalid Session ID'
        status_code = 400
    return r, status_code, sess
示例#24
0
def delete_data_model():
    """Reset the current session's data model and drop its derived tables.

    Clears ``field_defs``, ``training_data`` and ``sample`` on the session
    row, sets its status back to ``dataset uploaded`` and drops the entity,
    block and coverage tables for the session. Tables that do not exist or
    fail with a ``ProgrammingError`` are ignored. Returns a JSON confirmation.
    """
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    for attribute in ('field_defs', 'training_data', 'sample'):
        setattr(dedupe_session, attribute, None)
    dedupe_session.status = 'dataset uploaded'
    db_session.add(dedupe_session)
    db_session.commit()

    table_patterns = [
        'entity_{0}',
        'block_{0}',
        'plural_block_{0}',
        'covered_{0}',
        'plural_key_{0}',
        'small_cov_{0}',
    ]
    engine = db_session.bind
    for pattern in table_patterns:  # pragma: no cover
        try:
            Table(pattern.format(session_id),
                  Base.metadata,
                  autoload=True,
                  autoload_with=engine).drop(engine)
        except (NoSuchTableError, ProgrammingError):
            # Best effort: a table that is missing or cannot be dropped is
            # skipped.
            pass

    payload = {
        'status': 'ok',
        'message': 'Data model for session {0} deleted'.format(session_id)
    }
    response = make_response(json.dumps(payload), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
示例#25
0
def mark_canon_cluster():
    """Apply a reviewer's decision on one canonical cluster.

    Query arguments: ``entity_id`` (required) plus optional comma-separated
    ``match_ids`` and ``distinct_ids``. Matched records are updated in the
    ``entity_<session>`` and ``entity_<session>_cr`` tables; distinct records
    are deleted from ``entity_<session>_cr``. The review machine is labelled
    0 when distinct ids were given and 1 otherwise, and the session's
    ``review_count`` is decremented. Returns a JSON summary, or a 400 JSON
    error when ``entity_id`` is missing.
    """
    session_id = flask_session['session_id']

    def as_json(payload, code):
        # Serialise a payload into the JSON response returned to the client.
        response = make_response(json.dumps(payload), code)
        response.headers['Content-Type'] = 'application/json'
        return response

    if not request.args.get('entity_id'):
        return as_json({
            'status': 'error',
            'message': '"entity_id" is a required parameter'
        }, 400)

    entity_id = request.args.get('entity_id')
    match_ids = request.args.get('match_ids')
    distinct_ids = request.args.get('distinct_ids')
    user = db_session.query(User).get(flask_session['api_key'])
    engine = db_session.bind

    if match_ids:
        match_ids = tuple(match_ids.split(','))
        upd = text('''
                UPDATE "entity_{0}" SET 
                    entity_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE entity_id in (
                    SELECT record_id 
                        FROM "entity_{0}_cr"
                    WHERE entity_id = :entity_id
                        AND record_id IN :record_ids
                )
                '''.format(session_id))
        upd_cr = text(''' 
                UPDATE "entity_{0}_cr" SET
                    target_record_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE record_id IN :record_ids
            '''.format(session_id))
        last_update = datetime.now().replace(tzinfo=TIME_ZONE)
        with engine.begin() as conn:
            # Both statements take the same bind parameters.
            params = dict(entity_id=entity_id,
                          last_update=last_update,
                          user_name=user.name,
                          record_ids=match_ids,
                          clustered=True,
                          checked_out=False)
            conn.execute(upd, **params)
            conn.execute(upd_cr, **params)

    if distinct_ids:
        distinct_ids = tuple(distinct_ids.split(','))
        delete = text(''' 
                DELETE FROM "entity_{0}_cr"
                WHERE entity_id = :entity_id
                    AND record_id IN :record_ids
            '''.format(session_id))
        with engine.begin() as conn:
            conn.execute(delete, entity_id=entity_id, record_ids=distinct_ids)

    dedupe_session = db_session.query(DedupeSession).get(session_id)
    machine = loads(dedupe_session.review_machine)
    machine.label(entity_id, 0 if distinct_ids else 1)
    dedupe_session.review_machine = dumps(machine)
    dedupe_session.review_count = dedupe_session.review_count - 1
    db_session.add(dedupe_session)
    db_session.commit()

    return as_json({
        'session_id': session_id,
        'entity_id': entity_id,
        'match_ids': match_ids,
        'distinct_ids': distinct_ids,
        'status': 'ok',
        'message': ''
    }, 200)
示例#26
0
def match_review():  # pragma: no cover
    """Render the match review page for the current session."""
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    return render_template('dedupe_session/match-review.html',
                           session_id=flask_session['session_id'],
                           dedupe_session=dedupe_session)
示例#27
0
def mark_cluster():
    """Apply a reviewer's decision on one cluster of the current session.

    Query arguments: ``entity_id`` (required) plus optional comma-separated
    integer ``match_ids`` and ``distinct_ids``. Matched records are marked as
    clustered by clerical review, along with rows whose ``target_record_id``
    points at them; distinct records are deleted from the entity table. The
    review machine is labelled 0 when distinct ids were given and 1
    otherwise, and the session's ``review_count`` is decremented. Returns a
    JSON summary, or a 400 JSON error when ``entity_id`` is missing.
    """
    session_id = flask_session['session_id']

    entity_id = request.args.get('entity_id')
    match_ids = request.args.get('match_ids')
    distinct_ids = request.args.get('distinct_ids')
    if not entity_id:
        # Same response as mark_canon_cluster gives for a missing entity_id.
        resp = make_response(json.dumps({
            'status': 'error',
            'message': '"entity_id" is a required parameter'
        }), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp

    dedupe_session = db_session.query(DedupeSession).get(session_id)
    user = db_session.query(User).get(flask_session['api_key'])
    engine = db_session.bind
    entity_table = Table('entity_{0}'.format(session_id),
                         Base.metadata,
                         autoload=True,
                         autoload_with=engine)
    if match_ids:
        match_ids = tuple([int(m) for m in match_ids.split(',')])
        # The two statements below name the id list differently
        # (:match_ids and :record_ids), so both keys are supplied.
        upd_vals = {
            'entity_id': entity_id,
            'record_ids': match_ids,
            'user_name': user.name,
            'clustered': True,
            'match_type': 'clerical review',
            'last_update': datetime.now().replace(tzinfo=TIME_ZONE),
            'match_ids': match_ids,
        }
        upd = text(''' 
            UPDATE "entity_{0}" SET
              entity_id = :entity_id,
              reviewer = :user_name,
              clustered = :clustered,
              match_type = :match_type,
              last_update = :last_update
            WHERE entity_id = :entity_id
              AND record_id IN :match_ids
        '''.format(session_id))
        with engine.begin() as conn:
            conn.execute(upd, **upd_vals)
        update_existing = text('''
            UPDATE "entity_{0}" SET 
                entity_id = :entity_id, 
                clustered = :clustered,
                reviewer = :user_name,
                match_type = :match_type,
                last_update = :last_update
                FROM (
                    SELECT e.record_id 
                        FROM "entity_{0}" AS e 
                        JOIN (
                            SELECT record_id 
                                FROM "entity_{0}"
                                WHERE entity_id = :entity_id
                                    AND record_id IN :record_ids
                        ) AS s 
                        ON e.target_record_id = s.record_id
                ) AS subq 
            WHERE "entity_{0}".record_id = subq.record_id
            '''.format(dedupe_session.id))
        with engine.begin() as c:
            c.execute(update_existing, **upd_vals)
    if distinct_ids:
        distinct_ids = tuple([int(d) for d in distinct_ids.split(',')])
        delete = entity_table.delete()\
            .where(entity_table.c.entity_id == entity_id)\
            .where(entity_table.c.record_id.in_(distinct_ids))
        with engine.begin() as c:
            c.execute(delete)

    machine = loads(dedupe_session.review_machine)
    if distinct_ids:
        machine.label(entity_id, 0)
    else:
        machine.label(entity_id, 1)
    dedupe_session.review_machine = dumps(machine)
    dedupe_session.review_count = dedupe_session.review_count - 1
    db_session.add(dedupe_session)
    db_session.commit()
    resp = {
        'session_id': session_id,
        'entity_id': entity_id,
        'match_ids': match_ids,
        'distinct_ids': distinct_ids,
        'status': 'ok',
        'message': ''
    }
    status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
示例#28
0
def about():  # pragma: no cover
    """Render the about page, passing the logged-in user when there is one."""
    user_id = flask_session.get('user_id')
    user = db_session.query(User).get(user_id) if user_id else None
    return render_template("about.html", user=user)
示例#29
0
def match():
    '''
    Find entities in a session that match a single posted record.

    The request body must be a string encoded JSON object. The keys read
    here are ``session_id``, ``object`` (a mapping of model field names to
    values) and the optional ``num_matches`` (defaults to 5). The payload is
    vetted by ``validate_post`` before any of those keys are used.

    An exact match is tried first by comparing an MD5 hash of the model field
    values with ``source_hash`` in the session's entity table. When no exact
    match exists, the record is blocked and scored with the session's
    gazetteer settings against the processed records sharing a block key.

    Returns a JSON response. On success its ``matches`` key holds a (possibly
    empty) list of matching records; on failure it carries ``status: error``
    and either the status code reported by ``validate_post`` or 400.
    '''
    def respond(payload, code):
        # Serialize ``payload`` as a JSON response with the given status code.
        # NOTE(review): rows copied from the raw table may hold values that
        # the default JSON encoder rejects (dates, decimals) -- confirm
        # whether a ``default=`` hook is needed here.
        resp = make_response(json.dumps(payload), code)
        resp.headers['Content-Type'] = 'application/json'
        return resp

    try:
        post = json.loads(request.data)
    except ValueError:
        return respond({
            'status': 'error',
            'message': ''' 
                The content of your request should be a 
                string encoded JSON object.
            ''',
            'object': request.data,
        }, 400)

    r, status_code, sess = validate_post(post)
    if r['status'] != 'error':
        session_id = post['session_id']
        n_matches = post.get('num_matches', 5)
        obj = post['object']
        field_defs = json.loads(sess.field_defs)
        # A field may appear in several definitions; keep one sorted copy so
        # the hash input and the SELECT column list have a stable order.
        model_fields = sorted(set(f['field'] for f in field_defs))
        fields = ', '.join(['r.{0}'.format(f) for f in model_fields])
        engine = db_session.bind
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)
        try:
            # Falsy values (None, '', 0) are hashed as the empty string.
            hash_me = ';'.join([
                unicode(obj[field]) if obj[field] else ''
                for field in model_fields])
        except KeyError:
            # At least one model field is missing from the posted object.
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do no match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            return respond(r, 400)
        if set(obj.keys()).isdisjoint(set(model_fields)):
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do no match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            return respond(r, 400)
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        exact_match = db_session.query(entity_table)\
            .filter(entity_table.c.source_hash == md5_hash).first()
        match_list = []
        if exact_match:
            # Return the members of the entity the identical record belongs to.
            sel = text(''' 
                  SELECT {0} 
                  FROM "raw_{1}" AS r
                  JOIN "entity_{1}" AS e
                    ON r.record_id = e.record_id
                  WHERE e.entity_id = :entity_id
                  LIMIT :limit
                '''.format(fields, session_id))
            with engine.begin() as conn:
                rows = list(conn.execute(sel,
                    entity_id=exact_match.entity_id, limit=n_matches))
            for row in rows:
                d = {f: getattr(row, f) for f in model_fields}
                d['entity_id'] = exact_match.entity_id
                d['match_confidence'] = '1.0'
                match_list.append(d)
        else:
            deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
            for k, v in obj.items():
                obj[k] = preProcess(unicode(v))
            block_keys = tuple(
                [b[0] for b in deduper.blocker([('blob', obj)])])

            # Sometimes the blocker does not find blocks. In this case we can't match
            if block_keys:
                sel = text('''
                      SELECT r.record_id, {1}
                      FROM "processed_{0}" as r
                      JOIN (
                        SELECT record_id
                        FROM "match_blocks_{0}"
                        WHERE block_key IN :block_keys
                      ) AS s
                      ON r.record_id = s.record_id
                    '''.format(session_id, fields))
                with engine.begin() as conn:
                    data_d = {int(i[0]): dict(zip(model_fields, i[1:]))
                        for i in conn.execute(sel, block_keys=block_keys)}
                if data_d:
                    deduper.index(data_d)
                    linked = deduper.match({'blob': obj}, threshold=0,
                        n_matches=n_matches)
                    if linked:
                        ids = set()
                        # NOTE(review): the confidence scores in ``linked``
                        # are discarded, so fuzzy matches are returned
                        # without a ``match_confidence`` key.
                        for id_set, _confidence in linked[0]:
                            ids.update(i for i in id_set if i != 'blob')
                        ids = tuple(ids)
                        # An empty tuple would render as ``IN ()``; skip the
                        # lookup when nothing was linked.
                        if ids:
                            sel = text(''' 
                                  SELECT {0}, r.record_id, e.entity_id
                                  FROM "raw_{1}" as r
                                  JOIN "entity_{1}" as e
                                    ON r.record_id = e.record_id
                                  WHERE r.record_id IN :ids
                                '''.format(fields, session_id))
                            with engine.begin() as conn:
                                matches = list(conn.execute(sel, ids=ids))
                            for found in matches:
                                m = {f: getattr(found, f)
                                     for f in model_fields}
                                m['record_id'] = found.record_id
                                m['entity_id'] = found.entity_id
                                match_list.append(m)
            else:
                if sentry:
                    sentry.captureMessage('Unable to block record', extra=post)
        r['matches'] = match_list
    # Both the validation-failure path and the success path end here; without
    # this return the view would hand Flask ``None``.
    return respond(r, status_code)
示例#30
0
def add_entity():
    '''
    Add an entry to the entity map.
    POST data should be a string encoded JSON object which looks like:

    {
        "object": {
            "city":"Macon",
            "cont_name":"Kinght & Fisher, LLP",
            "zip":"31201",
            "firstname":null,
            "employer":null,
            "address":"350 Second St",
            "record_id":3,
            "type":"Monetary",
            "occupation":null
        },
        "api_key":"6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to look
    up the record in the processed table before making the entry; when no
    such row exists the object is inserted into the raw table and copied into
    the processed table. If match_id is present, the record will be added as
    a member of the entity that the record with that id belongs to.

    The session is taken from ``session_id`` in the Flask session.

    Returns a JSON response with ``status`` and ``message`` keys. The status
    code is 200 on success and 400 when the body is not JSON, has no
    ``object`` mapping, has fields that differ from the model, or names a
    ``match_id`` that is not in the entity map.
    '''
    def respond(payload, code):
        # Serialize ``payload`` as a JSON response with the given status code.
        resp = make_response(json.dumps(payload), code)
        resp.headers['Content-Type'] = 'application/json'
        return resp

    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200
    session_id = flask_session['session_id']

    try:
        post = json.loads(request.data)
    except ValueError:
        return respond({
            'status': 'error',
            'message': ''' 
                The content of your request should be a 
                string encoded JSON object.
            ''',
            'object': request.data,
        }, 400)

    # Reject payloads that are not an object or lack the "object" mapping
    # instead of failing with a KeyError / TypeError further down.
    obj = post.get('object') if isinstance(post, dict) else None
    if not isinstance(obj, dict):
        r['status'] = 'error'
        r['message'] = "The request must contain an 'object' mapping of model fields"
        return respond(r, 400)

    record_id = obj.get('record_id')
    if record_id:
        # record_id is not a model field; keep it out of the field comparison.
        del obj['record_id']
    match_id = post.get('match_id')
    sess = db_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    # Map each field name to every type it is declared with.
    fds = {}
    for fd in field_defs:
        fds.setdefault(fd['field'], []).append(fd['type'])
    if set(fds.keys()) != set(obj.keys()):
        r['status'] = 'error'
        r['message'] = "The fields in the object do not match the fields in the model"
        status_code = 400
    else:
        engine = db_session.bind
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)

        # Resolve the target entity before writing anything so an unknown
        # match_id cannot leave a half-inserted record behind.
        target_entity = None
        if match_id:
            target_entity = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            if target_entity is None:
                r['status'] = 'error'
                r['message'] = "The match_id does not reference a record in the entity map"
                return respond(r, 400)

        proc_table = Table('processed_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row: # pragma: no cover
            raw_table = Table('raw_{0}'.format(session_id), Base.metadata,
                autoload=True, autoload_with=engine, keep_existing=True)
            col_defs = []
            for field, field_types in fds.items():
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_defs.append(
                        'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'
                        .format(field))
                else:
                    col_defs.append(
                        'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'
                        .format(field))
            proc_ins = (
                'INSERT INTO "processed_{0}" (SELECT record_id, {1} '
                'FROM "raw_{0}" WHERE record_id = :record_id)'
            ).format(session_id, ', '.join(col_defs))

            with engine.begin() as conn:
                # scalar() unwraps the RETURNING result into the new id; the
                # result object itself cannot be used as a bind parameter.
                record_id = conn.execute(
                    raw_table.insert().returning(raw_table.c.record_id),
                    **obj).scalar()
                conn.execute(text(proc_ins), record_id=record_id)

        # NOTE(review): the hash input follows the iteration order of ``fds``
        # and runs values through preProcess, whereas match() hashes sorted
        # field names without preProcess -- verify which form source_hash is
        # expected to use.
        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds])
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        if target_entity is not None:
            # Join the existing entity rather than starting a new one.
            entity['target_record_id'] = match_id
            entity['entity_id'] = target_entity.entity_id
        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)

        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k, v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]}
                for b in deduper.blocker([(record_id, obj)])]
        # The blocker may produce no keys; executing the INSERT with no
        # parameter sets would fail on the unbound parameters.
        if block_keys:
            with engine.begin() as conn:
                conn.execute(text(''' 
                    INSERT INTO "match_blocks_{0}" (
                        block_key,
                        record_id
                    ) VALUES (:block_key, :record_id)
                '''.format(sess.id)), *block_keys)
    # NOTE(review): as in the original, the review count is also decremented
    # when the field check above failed -- confirm that this is intended.
    if sess.review_count:
        sess.review_count = sess.review_count - 1
        db_session.add(sess)
        db_session.commit()
    return respond(r, status_code)