def decorated(*args, **kwargs):
    # Auth wrapper: resolve an API key from (in priority order) the login
    # session, form data, the query string, or a JSON request body, then
    # verify the requested dedupe session belongs to one of the user's groups.
    api_key = None
    resp = {
        'status': 'ok',
        'message': ''
    }
    status_code = 200
    if flask_session.get('user_id'):
        api_key = flask_session['user_id']
    elif request.form.get('api_key'):
        api_key = request.form['api_key']
    elif request.args.get('api_key'):
        api_key = request.args['api_key']
    else:
        try:
            api_key = json.loads(request.data).get('api_key')
        except ValueError:
            # Body was not valid JSON; fall through to the 401 below.
            api_key = None
    if not api_key:
        resp['status'] = 'error'
        resp['message'] = "'api_key' is a required parameter"
        status_code = 401
        response = make_response(json.dumps(resp), status_code)
        response.headers['Content-Type'] = 'application/json'
        return response
    else:
        # Cache the ids of every session the user's groups may access.
        # NOTE(review): assumes api_key is a valid User primary key; an
        # unknown key makes `user` None and `user.groups` raises — confirm
        # upstream guarantees.
        user = db_session.query(User).get(api_key)
        sess = db_session.query(DedupeSession)\
            .filter(DedupeSession.group.has(
                Group.id.in_([i.id for i in user.groups])))\
            .all()
        flask_session['user_sessions'] = [s.id for s in sess]
        flask_session['api_key'] = api_key
        dedupe_session = None
        if request.args.get('session_id'):
            session_id = request.args['session_id']
            flask_session['session_id'] = request.args['session_id']
        elif flask_session.get('session_id'):
            session_id = flask_session['session_id']
        else:
            flash("Sorry, could not find a session id")
            return redirect(url_for('admin.index'))
        if flask_session['session_id'] not in flask_session['user_sessions']:
            flash("Sorry, you don't have access to that session")
            return redirect(url_for('admin.index'))
        return f(*args, **kwargs)
def delete_session():
    """Delete the current dedupe session row and queue removal of its tables."""
    session_id = flask_session['session_id']
    sess = db_session.query(DedupeSession).get(session_id)
    db_session.delete(sess)
    db_session.commit()
    # Table-name templates; the cleanup worker fills in the session id.
    table_templates = [
        'entity_{0}',
        'entity_{0}_cr',
        'raw_{0}',
        'processed_{0}',
        'processed_{0}_cr',
        'block_{0}',
        'block_{0}_cr',
        'plural_block_{0}',
        'plural_block_{0}_cr',
        'cr_{0}',
        'covered_{0}',
        'covered_{0}_cr',
        'plural_key_{0}',
        'plural_key_{0}_cr',
        'small_cov_{0}',
        'small_cov_{0}_cr',
        'canon_{0}',
        'exact_match_{0}',
        'match_blocks_{0}',
    ]
    cleanupTables.delay(session_id, tables=table_templates)
    response = make_response(json.dumps({
        'session_id': session_id,
        'status': 'ok'
    }))
    response.headers['Content-Type'] = 'application/json'
    return response
def mark_pair():
    # Record the reviewer's yes/no/unsure/finish decision for the pair
    # currently displayed in the active-learning training loop.
    action = request.args['action']
    flask_session['last_interaction'] = datetime.now()
    counter = flask_session.get('counter')
    sess = db_session.query(DedupeSession).get(flask_session['session_id'])
    deduper = flask_session['deduper']
    # Attempt to cast the training input appropriately
    # TODO: Figure out LatLong type
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    current_pair = flask_session['current_pair']
    left, right = current_pair
    l_d = {}
    r_d = {}
    # Price-typed fields must be numeric before being stored as labels.
    for k, v in left.items():
        if 'Price' in fds[k]:
            l_d[k] = float(v)
        else:
            l_d[k] = v
    for k, v in right.items():
        if 'Price' in fds[k]:
            r_d[k] = float(v)
        else:
            r_d[k] = v
    current_pair = [l_d, r_d]
    if sess.training_data:
        labels = json.loads(sess.training_data)
    else:
        labels = {'distinct': [], 'match': []}
    if action == 'yes':
        labels['match'].append(current_pair)
        counter['yes'] += 1
        resp = {'counter': counter}
    elif action == 'no':
        labels['distinct'].append(current_pair)
        counter['no'] += 1
        resp = {'counter': counter}
    elif action == 'finish':
        # Reviewer is done training; queue the full dedupe run.
        dedupeRaw.delay(flask_session['session_id'])
        resp = {'finished': True}
        flask_session['dedupe_start'] = time.time()
    else:
        counter['unsure'] += 1
        # NOTE(review): the counter is written back to the session only on
        # this 'unsure' branch; 'yes'/'no' mutate the same dict object —
        # confirm the session backend persists in-place mutations.
        flask_session['counter'] = counter
        resp = {'counter': counter}
    sess.training_data = json.dumps(labels, default=_to_json)
    db_session.add(sess)
    db_session.commit()
    deduper.markPairs(labels)
    if resp.get('finished'):
        del flask_session['deduper']
    resp = make_response(json.dumps(resp))
    resp.headers['Content-Type'] = 'application/json'
    return resp
def upload():
    """Accept an uploaded data file, stage it as CSV on disk and kick off
    session initialization.

    Returns a JSON response with the new session id.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # BUGFIX: rsplit with no maxsplit splits on every dot, so a name like
    # "data.2014.csv" yielded "2014" as the extension. Split once from the
    # right and take the last piece.
    file_type = f.filename.rsplit('.', 1)[-1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv': # pragma: no cover
        # Convert non-CSV uploads (xls, etc.) to CSV in memory.
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # First line of the CSV is the header row; slugify the column names.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(
        id=session_id,
        name=request.form.get('name'),
        description=request.form.get('description'),
        filename=f.filename,
        group=group,
        status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    # Stage the raw CSV where the background worker expects it.
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
def get_unmatched():
    # Return one raw record that has no entry in the entity map yet, for
    # clerical review; flip the session to 'canonical' when none remain.
    resp = {
        'status': 'ok',
        'message': '',
        'object': {},
        'remaining': 0,
    }
    status_code = 200
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    resp['remaining'] = dedupe_session.review_count
    raw_fields = list(set([f['field'] for f in json.loads(dedupe_session.field_defs)]))
    raw_fields.append('record_id')
    fields = ', '.join(['r.{0}'.format(f) for f in raw_fields])
    # LEFT JOIN + IS NULL selects raw rows not yet present in the entity map.
    sel = '''
        SELECT {0}
        FROM "raw_{1}" as r
        LEFT JOIN "entity_{1}" as e
          ON r.record_id = e.record_id
        WHERE e.record_id IS NULL
        LIMIT 1
    '''.format(fields, session_id)
    engine = db_session.bind
    with engine.begin() as conn:
        rows = [dict(zip(raw_fields, r)) for r in conn.execute(sel)]
    if not rows:
        # Everything is matched; the session graduates to canonical state.
        dedupe_session.status = 'canonical'
        db_session.add(dedupe_session)
        db_session.commit()
    else:
        resp['object'] = rows[0]
    response = make_response(json.dumps(resp), status_code)
    response.headers['Content-Type'] = 'application/json'
    return response
def get_canon_cluster():
    """Return the next canonical cluster awaiting review, as JSON."""
    payload = {
        'status': 'ok',
        'message': '',
        'objects': [],
    }
    status_code = 200
    session_id = flask_session['session_id']
    checkinSessions()
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    entity_id, cluster, prediction = getCluster(
        session_id, 'entity_{0}_cr', 'cr_{0}')
    if cluster:
        payload['entity_id'] = entity_id
        payload['objects'] = cluster
        payload['prediction'] = prediction
    else:
        # Nothing left to review; queue preparation of matching mode.
        getMatchingReady.delay(session_id)
    payload['total_clusters'] = dedupe_session.entity_count
    payload['review_remainder'] = dedupe_session.review_count
    response = make_response(json.dumps(payload), status_code)
    response.headers['Content-Type'] = 'application/json'
    return response
def field_definitions():
    """Serve the session's stored field definitions as raw JSON."""
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    resp = make_response(dedupe_session.field_defs, 200)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def settings_file():
    """Download the session's dedupe settings file as an attachment."""
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    resp = make_response(dedupe_session.settings_file, 200)
    disposition = 'attachment; filename=%s.dedupe_settings' % dedupe_session.id
    resp.headers['Content-Disposition'] = disposition
    return resp
def validate(self):
    """Run base form validation, then reject duplicate names and emails."""
    if not Form.validate(self):
        return False
    name_taken = db_session.query(User)\
        .filter(User.name == self.name.data).first()
    if name_taken:
        self.name.errors.append('Name is already registered')
        return False
    email_taken = db_session.query(User)\
        .filter(User.email == self.email.data).first()
    if email_taken:
        self.email.errors.append('Email address is already registered')
        return False
    return True
def clear_error():
    """Mark a work-table error row as cleared."""
    work = db_session.query(WorkTable).get(request.args['work_id'])
    work.cleared = True
    db_session.add(work)
    db_session.commit()
    response = make_response(json.dumps({'status': 'ok'}))
    response.headers['Content-Type'] = 'application/json'
    return response
def getDistinct(field_name, session_id):
    """Return the unique, non-empty values of one column of the session's
    processed table, as unicode strings."""
    engine = app_session.bind
    metadata = MetaData()
    table = Table('processed_%s' % session_id, metadata,
                  autoload=True, autoload_with=engine)
    column = getattr(table.c, field_name)
    query = app_session.query(distinct(column))\
        .filter(and_(column != None, column != ''))
    return list(set(unicode(row[0]) for row in query.all()))
def mark_all_canon_cluster():
    """Queue bulk acceptance of every remaining canonical cluster."""
    session_id = flask_session['session_id']
    reviewer = db_session.query(User).get(flask_session['api_key'])
    bulkMarkCanonClusters.delay(session_id, user=reviewer.name)
    resp = make_response(json.dumps({}), 200)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def select_field_types():
    # Render / handle the form where the user assigns a dedupe type
    # (String, Price, ...) and an optional "missing" flag to each field
    # selected on the previous screen.
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    errors = db_session.query(WorkTable)\
        .filter(WorkTable.session_id == dedupe_session.id)\
        .filter(WorkTable.cleared == False)\
        .all()
    errors = [e.value for e in errors]
    field_list = flask_session['field_list']
    if request.method == 'POST':
        field_defs = []
        form = {}
        # Collapse the multi-valued form into {input_name: [values]}.
        for k in request.form.keys():
            if k != 'csrf_token':
                form[k] = request.form.getlist(k)
        ftypes = sorted(form.items())
        # Group inputs by field name: "<field>_type" and "<field>_missing"
        # share the prefix before the last underscore. groupby needs the
        # sorted() above to keep each field's inputs adjacent.
        for k, g in groupby(ftypes, key=lambda x: x[0].rsplit('_', 1)[0]):
            vals = list(g)
            has_missing = False
            for ftype, val in vals:
                if ftype == '{0}_missing'.format(k):
                    has_missing = True
            fs = []
            # One field def per selected type on the "<field>_type" input.
            for field, val in vals:
                fs.extend([{'field': k, 'type': val[i]} \
                    for i in range(len(val)) if field.endswith('type')])
            for f in fs:
                if has_missing:
                    f.update({'has_missing': True})
            field_defs.extend(fs)
        dedupe_session = db_session.query(DedupeSession).get(
            flask_session['session_id'])
        dedupe_session.field_defs = json.dumps(field_defs)
        dedupe_session.status = 'model defined'
        db_session.add(dedupe_session)
        db_session.commit()
        # Only start model initialization if no background errors are pending.
        if not errors:
            initializeModel.delay(dedupe_session.id)
        return redirect(url_for('trainer.training_run'))
    return render_template('dedupe_session/select_field_types.html',
                           field_list=field_list,
                           dedupe_session=dedupe_session,
                           errors=errors)
def training_data():
    """Download the session's training data as a JSON attachment."""
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    resp = make_response(dedupe_session.training_data, 200)
    resp.headers['Content-Type'] = 'text/plain'
    disposition = 'attachment; filename=%s_training.json' % dedupe_session.id
    resp.headers['Content-Disposition'] = disposition
    return resp
def mark_all_clusters():
    """Queue bulk acceptance of every remaining cluster for this session."""
    # Need to update existing clusters with new entity_id here, too.
    session_id = flask_session['session_id']
    reviewer = db_session.query(User).get(flask_session['api_key'])
    bulkMarkClusters.delay(session_id, user=reviewer.name)
    response = make_response(
        json.dumps({'status': 'ok', 'message': ''}), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
def session_review():
    """Render the session-review screen (first or second pass)."""
    # A 'second_review' query arg flips the template into second-pass mode.
    first_review = not request.args.get('second_review')
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    return render_template('dedupe_session/session-review.html',
                           session_id=session_id,
                           first_review=first_review,
                           dedupe_session=dedupe_session)
def getCluster(session_id, entity_pattern, raw_pattern):
    # Pop the next entity to review off the session's review machine and
    # return (entity_id, member_records, machine_prediction), or
    # (None, None, None) when the entity has no records.
    ent_name = entity_pattern.format(session_id)
    raw_name = raw_pattern.format(session_id)
    sess = app_session.query(DedupeSession).get(session_id)
    app_session.refresh(sess)
    cluster_list = []
    prediction = None
    # The review machine is stored pickled on the session row; persist it
    # again immediately after advancing it.
    machine = cPickle.loads(sess.review_machine)
    entity_id = machine.get_next()
    sess.review_machine = cPickle.dumps(machine)
    app_session.add(sess)
    app_session.commit()
    engine = app_session.bind
    model_fields = list(set([f['field'] for f in json.loads(sess.field_defs)]))
    raw_cols = ', '.join(['r.{0}'.format(f) for f in model_fields])
    sel = text('''
        SELECT e.confidence, {0}, r.record_id
        FROM "{1}" AS r
        JOIN "{2}" as e
          ON r.record_id = e.record_id
        WHERE e.entity_id = :entity_id
        ORDER BY e.confidence
    '''.format(raw_cols, raw_name, ent_name))
    records = list(engine.execute(sel, entity_id=entity_id))
    if records:
        raw_fields = ['confidence'] + model_fields + ['record_id']
        max_confidence = max([r['confidence'] for r in records])
        cluster_length = len(records)
        # Ask the machine whether this cluster is likely all matches.
        prediction = machine.predict([max_confidence, cluster_length])
        for thing in records:
            d = {}
            for k,v in zip(raw_fields, thing):
                d[k] = v
            # d['confidence'] = formatPercentage(d['confidence'])
            cluster_list.append(d)
        # Check the cluster out for one minute so reviewers don't collide.
        one_minute = datetime.now() + timedelta(minutes=1)
        upd = text('''
            UPDATE "{0}"
            SET checked_out = TRUE,
                checkout_expire = :one_minute
            WHERE entity_id = :entity_id
        '''.format(ent_name))
        with engine.begin() as c:
            c.execute(upd, entity_id=entity_id, one_minute=one_minute)
        return entity_id, cluster_list, prediction
    else:
        return None, None, None
def decorated(*args, **kwargs):
    """Role-checking wrapper: run the view only when every one of the
    user's roles appears in the allowed `roles`."""
    user_id = flask_session.get('user_id')
    if not user_id:
        return redirect(url_for('auth.login'))
    user = db_session.query(User).get(user_id)
    user_roles = {r.name for r in user.roles}
    if user_roles.issubset(set(roles)):
        return f(*args, **kwargs)
    flash('Sorry, you don\'t have access to that page')
    return redirect(url_for('admin.index'))
def select_fields():
    # Show the columns of the uploaded data and let the user choose which
    # ones to compare on.
    status_code = 200
    errors = []
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    fields = flask_session.get('fieldnames')
    # If the fields are not in the session, that means that the user has come
    # here directly from the home page. We'll try to load them from the raw
    # table in the database but if that does not exist yet (which is possible)
    # then we'll redirect them to the home page.
    if not fields:
        meta = MetaData()
        engine = db_session.bind
        try:
            raw = Table('raw_{0}'.format(flask_session['session_id']), meta,
                autoload=True, autoload_with=engine, keep_existing=True)
            fields = [r for r in raw.columns.keys() if r != 'record_id']
            flask_session['fieldnames'] = fields
        except NoSuchTableError:
            return redirect(url_for('admin.index'))
    errors = db_session.query(WorkTable)\
        .filter(WorkTable.session_id == dedupe_session.id)\
        .filter(WorkTable.cleared == False)\
        .all()
    if request.method == 'POST':
        field_list = [r for r in request.form if r != 'csrf_token']
        flask_session['field_list'] = field_list
        if field_list:
            return redirect(url_for('trainer.select_field_types'))
        else:
            # NOTE(review): status_code is set to 400 here but never applied
            # to the response below — confirm whether that is intended.
            errors = ['You must select at least one field to compare on.']
            status_code = 400
    return render_template('dedupe_session/select_fields.html',
                           errors=errors,
                           fields=fields,
                           dedupe_session=dedupe_session)
def training_run():
    # Prepare the interactive training screen: restore yes/no counts from
    # saved training data and rehydrate the deduper from the stored sample.
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    if dedupe_session.training_data:
        td = json.loads(dedupe_session.training_data)
        flask_session['counter'] = {
            'yes': len(td['match']),
            'no': len(td['distinct']),
            'unsure': 0
        }
    else:
        flask_session['counter'] = {
            'yes': 0,
            'no': 0,
            'unsure': 0,
        }
    errors = db_session.query(WorkTable)\
        .filter(WorkTable.session_id == dedupe_session.id)\
        .filter(WorkTable.cleared == False)\
        .all()
    if not errors:
        status_code = 200
        field_defs = json.loads(dedupe_session.field_defs)
        if dedupe_session.sample:
            # The sample is pickled on the session row by a background task.
            sample = cPickle.loads(dedupe_session.sample)
            deduper = dedupe.Dedupe(field_defs, data_sample=sample)
            flask_session['deduper'] = deduper
    else:
        # Background work reported errors; give the worker a beat and
        # reload the row before rendering with a 500.
        status_code = 500
        time.sleep(1)
        db_session.refresh(dedupe_session)
    return make_response(
        render_template('dedupe_session/training_run.html',
                        errors=errors,
                        dedupe_session=dedupe_session),
        status_code)
def checkinSessions():
    """Release expired, unclustered checkouts on every session's entity table."""
    now = datetime.now()
    session_ids = [row.id for row in app_session.query(DedupeSession.id).all()]
    engine = init_engine(current_app.config['DB_CONN'])
    for sess_id in session_ids:
        try:
            entity_table = Table('entity_%s' % sess_id, Base.metadata,
                                 autoload=True, autoload_with=engine)
            release = entity_table.update()\
                .where(entity_table.c.checkout_expire <= now)\
                .where(entity_table.c.clustered == False)\
                .values(checked_out=False, checkout_expire=None)
            with engine.begin() as conn:
                conn.execute(release)
        except NoSuchTableError: # pragma: no cover
            # The session's entity table may not exist yet; skip it.
            pass
    return None
def validate(self):
    """Run base form validation, then check credentials against the user table."""
    if not Form.validate(self):
        return False
    # Email comparison is case-insensitive on both sides.
    user = db_session.query(User)\
        .filter(func.lower(User.email) == func.lower(self.email.data))\
        .first()
    if user is None:
        self.email.errors.append('Email address is not registered')
        return False
    if not user.check_password(user.name, self.password.data):
        self.password.errors.append('Password is not valid')
        return False
    # Stash the authenticated user for the view to log in.
    self.user = user
    return True
def validate_post(post):
    """Validate a match/add POST body.

    Returns a tuple (response_dict, status_code, session) where session is
    the DedupeSession row, or None when the id is missing or unknown.
    """
    session_id = post.get('session_id')
    obj = post.get('object')
    r = {'status': 'ok', 'message': '', 'object': obj}
    status_code = 200
    # BUGFIX: only hit the database once we know we actually have a session
    # id — the original called Query.get(None), which is wasteful and warns
    # on newer SQLAlchemy. get(None) returned None, so behavior is unchanged.
    sess = None
    if session_id:
        sess = db_session.query(DedupeSession).get(session_id)
    if not session_id:
        r['status'] = 'error'
        r['message'] = 'Session ID is required'
        status_code = 401
    elif not obj:
        r['status'] = 'error'
        r['message'] = 'Match object is required'
        status_code = 400
    elif not sess:
        r['status'] = 'error'
        r['message'] = 'Invalid Session ID'
        status_code = 400
    return r, status_code, sess
def delete_data_model():
    """Reset the session's model and drop the tables derived from it."""
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    # Wipe everything the training workflow produced.
    dedupe_session.field_defs = None
    dedupe_session.training_data = None
    dedupe_session.sample = None
    dedupe_session.status = 'dataset uploaded'
    db_session.add(dedupe_session)
    db_session.commit()
    derived_tables = [
        'entity_{0}',
        'block_{0}',
        'plural_block_{0}',
        'covered_{0}',
        'plural_key_{0}',
        'small_cov_{0}',
    ]
    engine = db_session.bind
    for template in derived_tables: # pragma: no cover
        try:
            tbl = Table(template.format(session_id), Base.metadata,
                        autoload=True, autoload_with=engine)
            tbl.drop(engine)
        except (NoSuchTableError, ProgrammingError):
            # Table was never created or is already gone; nothing to drop.
            pass
    payload = {
        'status': 'ok',
        'message': 'Data model for session {0} deleted'.format(session_id)
    }
    resp = make_response(json.dumps(payload), 200)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def mark_canon_cluster():
    # Apply a reviewer's decision about a canonical (cross-record) cluster:
    # attach the accepted records to the entity, delete the rejected ones,
    # and feed the outcome back to the review machine.
    session_id = flask_session['session_id']
    if not request.args.get('entity_id'):
        resp = {
            'status': 'error',
            'message': '"entity_id" is a required parameter'
        }
        status_code = 400
    else:
        entity_id = request.args.get('entity_id')
        match_ids = request.args.get('match_ids')
        distinct_ids = request.args.get('distinct_ids')
        user = db_session.query(User).get(flask_session['api_key'])
        engine = db_session.bind
        if match_ids:
            match_ids = tuple([d for d in match_ids.split(',')])
            # Point the member records of the accepted canonical cluster at
            # the entity, in both the entity table and its _cr shadow.
            upd = text('''
                UPDATE "entity_{0}"
                SET entity_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE entity_id in (
                    SELECT record_id
                    FROM "entity_{0}_cr"
                    WHERE entity_id = :entity_id
                      AND record_id IN :record_ids
                )
            '''.format(session_id))
            upd_cr = text('''
                UPDATE "entity_{0}_cr"
                SET target_record_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE record_id IN :record_ids
            '''.format(session_id))
            last_update = datetime.now().replace(tzinfo=TIME_ZONE)
            with engine.begin() as c:
                c.execute(upd, entity_id=entity_id,
                          last_update=last_update, user_name=user.name,
                          record_ids=match_ids, clustered=True,
                          checked_out=False)
                c.execute(upd_cr, entity_id=entity_id,
                          last_update=last_update, user_name=user.name,
                          record_ids=match_ids, clustered=True,
                          checked_out=False)
        if distinct_ids:
            distinct_ids = tuple([d for d in distinct_ids.split(',')])
            delete = text('''
                DELETE FROM "entity_{0}_cr"
                WHERE entity_id = :entity_id
                  AND record_id IN :record_ids
            '''.format(session_id))
            with engine.begin() as c:
                c.execute(delete, entity_id=entity_id,
                          record_ids=distinct_ids)
        dedupe_session = db_session.query(DedupeSession).get(session_id)
        machine = loads(dedupe_session.review_machine)
        # Label the cluster for the active learner: 0 if any records were
        # rejected, 1 if the whole cluster was accepted.
        if distinct_ids:
            machine.label(entity_id, 0)
        else:
            machine.label(entity_id, 1)
        dedupe_session.review_machine = dumps(machine)
        dedupe_session.review_count = dedupe_session.review_count - 1
        db_session.add(dedupe_session)
        db_session.commit()
        resp = {
            'session_id': session_id,
            'entity_id': entity_id,
            'match_ids': match_ids,
            'distinct_ids': distinct_ids,
            'status': 'ok',
            'message': ''
        }
        status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def match_review(): # pragma: no cover
    """Render the match-review screen for the active session."""
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    return render_template('dedupe_session/match-review.html',
                           session_id=session_id,
                           dedupe_session=dedupe_session)
def mark_cluster():
    # Apply a reviewer's decision about a within-table cluster and feed the
    # outcome back to the review machine.
    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    user = db_session.query(User).get(flask_session['api_key'])
    engine = db_session.bind
    entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
        autoload=True, autoload_with=engine)
    # TODO: Return an error if these args are not present.
    entity_id = request.args.get('entity_id')
    match_ids = request.args.get('match_ids')
    distinct_ids = request.args.get('distinct_ids')
    training_data = json.loads(dedupe_session.training_data)
    if match_ids:
        match_ids = tuple([int(m) for m in match_ids.split(',')])
        upd_vals = {
            'entity_id': entity_id,
            'record_ids': match_ids,
            'user_name': user.name,
            'clustered': True,
            'match_type': 'clerical review',
            'last_update': datetime.now().replace(tzinfo=TIME_ZONE),
            'match_ids': match_ids,
        }
        upd = text('''
            UPDATE "entity_{0}"
            SET entity_id = :entity_id,
                reviewer = :user_name,
                clustered = :clustered,
                match_type = :match_type,
                last_update = :last_update
            WHERE entity_id = :entity_id
              AND record_id IN :match_ids
        '''.format(session_id))
        with engine.begin() as conn:
            conn.execute(upd, **upd_vals)
        # Also re-point any rows whose target_record_id referenced one of
        # the records just confirmed into this entity.
        update_existing = text('''
            UPDATE "entity_{0}"
            SET entity_id = :entity_id,
                clustered = :clustered,
                reviewer = :user_name,
                match_type = :match_type,
                last_update = :last_update
            FROM (
                SELECT e.record_id
                FROM "entity_{0}" AS e
                JOIN (
                    SELECT record_id
                    FROM "entity_{0}"
                    WHERE entity_id = :entity_id
                      AND record_id IN :record_ids
                ) AS s
                  ON e.target_record_id = s.record_id
            ) AS subq
            WHERE "entity_{0}".record_id = subq.record_id
        '''.format(dedupe_session.id))
        with engine.begin() as c:
            c.execute(update_existing, **upd_vals)
        # training_data['match'].extend(pairs)
    if distinct_ids:
        distinct_ids = tuple([int(d) for d in distinct_ids.split(',')])
        delete = entity_table.delete()\
            .where(entity_table.c.entity_id == entity_id)\
            .where(entity_table.c.record_id.in_(distinct_ids))
        with engine.begin() as c:
            c.execute(delete)
        #training_data['distinct'].append(pairs)
    machine = loads(dedupe_session.review_machine)
    # Label for the active learner: 0 if anything was rejected, else 1.
    if distinct_ids:
        machine.label(entity_id, 0)
    else:
        machine.label(entity_id, 1)
    dedupe_session.review_machine = dumps(machine)
    dedupe_session.review_count = dedupe_session.review_count - 1
    db_session.add(dedupe_session)
    db_session.commit()
    resp = {
        'session_id': session_id,
        'entity_id': entity_id,
        'match_ids': match_ids,
        'distinct_ids': distinct_ids,
        'status': 'ok',
        'message': ''
    }
    status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def about(): # pragma: no cover
    """Render the about page, including the current user if logged in."""
    user = None
    user_id = flask_session.get('user_id')
    if user_id:
        user = db_session.query(User).get(flask_session['user_id'])
    return render_template("about.html", user=user)
def match():
    # Match a posted record against an existing session's entities: first by
    # exact source-hash lookup, then with the trained static gazetteer.
    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': ''' The content of your request should be a string encoded JSON object. ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    r, status_code, sess = validate_post(post)
    if r['status'] != 'error':
        api_key = post['api_key']
        session_id = post['session_id']
        n_matches = post.get('num_matches', 5)
        obj = post['object']
        field_defs = json.loads(sess.field_defs)
        model_fields = sorted(list(set([f['field'] for f in field_defs])))
        fields = ', '.join(['r.{0}'.format(f) for f in model_fields])
        engine = db_session.bind
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)
        try:
            # Build the hash input in model-field order, empty string for
            # falsy values; KeyError means the sent object is missing fields.
            hash_me = []
            for field in model_fields:
                if obj[field]:
                    hash_me.append(unicode(obj[field]))
                else:
                    hash_me.append('')
            hash_me = ';'.join(hash_me)
        except KeyError, e:
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do no match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp
        if set(obj.keys()).isdisjoint(set(model_fields)):
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do no match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        exact_match = db_session.query(entity_table)\
            .filter(entity_table.c.source_hash == md5_hash).first()
        match_list = []
        if exact_match:
            # Exact hash hit: return up to n_matches members of the entity.
            sel = text('''
                SELECT {0}
                FROM "raw_{1}" AS r
                JOIN "entity_{1}" AS e
                  ON r.record_id = e.record_id
                WHERE e.entity_id = :entity_id
                LIMIT :limit
            '''.format(fields, session_id))
            rows = []
            with engine.begin() as conn:
                rows = list(conn.execute(sel,
                    entity_id=exact_match.entity_id, limit=n_matches))
            for row in rows:
                d = {f: getattr(row, f) for f in model_fields}
                d['entity_id'] = exact_match.entity_id
                d['match_confidence'] = '1.0'
                match_list.append(d)
        else:
            deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
            for k,v in obj.items():
                obj[k] = preProcess(unicode(v))
            block_keys = tuple([b[0] for b in list(deduper.blocker([('blob', obj)]))])
            # Sometimes the blocker does not find blocks. In this case we can't match
            if block_keys:
                sel = text('''
                    SELECT r.record_id, {1}
                    FROM "processed_{0}" as r
                    JOIN (
                        SELECT record_id
                        FROM "match_blocks_{0}"
                        WHERE block_key IN :block_keys
                    ) AS s
                      ON r.record_id = s.record_id
                '''.format(session_id, fields))
                with engine.begin() as conn:
                    data_d = {int(i[0]): dict(zip(model_fields, i[1:])) \
                        for i in list(conn.execute(sel, block_keys=block_keys))}
                if data_d:
                    deduper.index(data_d)
                    linked = deduper.match({'blob': obj}, threshold=0, n_matches=n_matches)
                    if linked:
                        ids = []
                        confs = {}
                        for l in linked[0]:
                            id_set, confidence = l
                            ids.extend([i for i in id_set if i != 'blob'])
                            confs[id_set[1]] = confidence
                        ids = tuple(set(ids))
                        sel = text('''
                            SELECT {0}, r.record_id, e.entity_id
                            FROM "raw_{1}" as r
                            JOIN "entity_{1}" as e
                              ON r.record_id = e.record_id
                            WHERE r.record_id IN :ids
                        '''.format(fields, session_id))
                        matches = []
                        with engine.begin() as conn:
                            matches = list(conn.execute(sel, ids=ids))
                        for match in matches:
                            m = {f: getattr(match, f) for f in model_fields}
                            m['record_id'] = getattr(match, 'record_id')
                            m['entity_id'] = getattr(match, 'entity_id')
                            # m['match_confidence'] = float(confs[str(m['entity_id'])])
                            match_list.append(m)
            else:
                if sentry:
                    sentry.captureMessage('Unable to block record', extra=post)
        r['matches'] = match_list
        # NOTE(review): the visible source ends here without building or
        # returning a response — the JSON response construction appears to
        # have been truncated; confirm against the original file.
def add_entity():
    ''' Add an entry to the entity map.

    POST data should be a string encoded JSON object which looks like:

    {
        "object": {
            "city":"Macon",
            "cont_name":"Kinght & Fisher, LLP",
            "zip":"31201",
            "firstname":null,
            "employer":null,
            "address":"350 Second St",
            "record_id":3,
            "type":"Monetary",
            "occupation":null
        },
        "api_key":"6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100,
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to
    look up the record in the raw / processed table before making the
    entry. If match_id is present, the record will be added as a member of
    the entity referenced by the id.
    '''
    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200
    session_id = flask_session['session_id']
    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': ''' The content of your request should be a string encoded JSON object. ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    obj = post['object']
    record_id = obj.get('record_id')
    if record_id:
        del obj['record_id']
    match_id = json.loads(request.data).get('match_id')
    sess = db_session.query(DedupeSession).get(session_id)
    # Build {field: [types]} from the stored field definitions.
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    if not set(fds.keys()) == set(obj.keys()):
        r['status'] = 'error'
        r['message'] = "The fields in the object do not match the fields in the model"
        status_code = 400
    else:
        engine = db_session.bind
        proc_table = Table('processed_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row: # pragma: no cover
            # Record is not in the processed table yet: insert it into the
            # raw table and derive the processed row with the same casts the
            # bulk pipeline uses.
            raw_table = Table('raw_{0}'.format(session_id), Base.metadata,
                autoload=True, autoload_with=engine, keep_existing=True)
            # BUGFIX: original referenced the undefined name
            # `proc_table_name` here (NameError); the template expects the
            # session id.
            proc_ins = 'INSERT INTO "processed_{0}" (SELECT record_id, '\
                .format(session_id)
            col_defs = []
            for field in fds.keys():
                field_types = fds.get(field, ['String'])
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_defs.append(
                        'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field))
                else:
                    col_defs.append(
                        'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field))
            proc_ins += ', '.join(col_defs)
            proc_ins += ' FROM "raw_{0}" WHERE record_id = :record_id)'\
                .format(session_id)
            with engine.begin() as conn:
                record_id = conn.execute(raw_table.insert()\
                    .returning(raw_table.c.record_id), **obj)
                conn.execute(text(proc_ins), record_id=record_id)
        # Hash the normalized field values so future exact matches hit.
        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds.keys()])
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
            autoload=True, autoload_with=engine, keep_existing=True)
        if match_id:
            # Join the record to the existing entity it was matched with.
            entity['target_record_id'] = match_id
            entity_id = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            entity['entity_id'] = entity_id.entity_id
        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)
        # Register the new record's block keys so the gazetteer can find it.
        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k, v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]} \
            for b in list(deduper.blocker([(record_id, obj)]))]
        with engine.begin() as conn:
            conn.execute(text('''
                INSERT INTO "match_blocks_{0}" (
                    block_key,
                    record_id
                ) VALUES (:block_key, :record_id)
            '''.format(sess.id)), *block_keys)
        if sess.review_count:
            sess.review_count = sess.review_count - 1
        db_session.add(sess)
        db_session.commit()
    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp