def _svmall42_event_sim(sim, work, match, event, field, prefix=None):
    """Compare one event field of two person records, tolerating missing data.

    Calls ``sim(work[event][field], match[event][field])``.  When *prefix* is
    given, both values are first sliced to ``[0:prefix]``.  If anything fails
    (missing key, unsliceable value, error raised inside *sim*), the result of
    ``sim(None, None)`` is returned instead.
    """
    try:
        left = work[event][field]
        right = match[event][field]
        if prefix is not None:
            left, right = left[0:prefix], right[0:prefix]
        return sim(left, right)
    except Exception:
        # Best effort: incomplete records still get a feature value.
        # ``Exception`` rather than a bare ``except`` so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        return sim(None, None)


def svmall42(work, match, config, score):
    """Build the 'svmall42' SVM feature vector for the pair (work, match).

    NOTE: the function is currently disabled -- it calls ``sys.exit()`` as its
    first statement (marked "MUST BE FIXED" by the original author), so every
    call raises ``SystemExit`` and nothing below that line runs.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source.  In particular, confirm that the score
    normalisation is meant to apply to caller-supplied scores as well as to
    scores looked up via ``search()``.

    Parameters:
        work:   person record; read keys are '_id', 'sex', 'name' and the
                optional 'birth' / 'death' sub-records with 'date' and 'place'.
        match:  candidate person record with the same keys (except 'sex').
        config: mapping providing 'persons', 'families', 'match_persons' and
                'match_families' (objects with ``find_one``) and, optionally,
                'matchDBname' used to select the normalisation constant.
        score:  search score for the pair; when falsy it is looked up with
                ``search()`` (0.0 if the candidate is not among the hits).

    Returns:
        ``cleanupVect(features)`` for the assembled feature list.

    Raises:
        SystemExit: always, until the guard at the top is removed.
        IndexError: (once enabled) if a name contains no '/' separator, since
            the surname is read as the second '/'-separated part.
    """
    sys.exit()  #MUST BE FIXED

    features = svmfeatures(work, match)
    matchtxt = mt_tmp.matchtextPerson(work, config['persons'], config['families'])

    # Lucene score: look it up among the search hits when the caller gave none.
    if not score:
        candidates = search(matchtxt, work['sex'], ant=30, config=common.config)  # Lucene search
        score = 0.0
        for kid, sc in candidates:
            if str(kid) == str(match['_id']):
                score = sc
                break
    # Normalise with the constant for this match database, falling back to
    # the default constant when that lookup or division fails.
    try:
        score = score / norm[config['matchDBname']]
    except Exception:
        score = score / norm['default']
    if score == 0.0:
        score = 0.01  #??
    features.append(score)

    features.append(nodeSim(work, match))

    # Families in which each person appears as a child.
    workFam = config['families'].find_one({'children': work['_id']})
    matchFam = config['match_families'].find_one({'children': match['_id']})
    features.append(familySim(workFam, config['persons'],
                              matchFam, config['match_persons']))

    features.append(antFeaturesNorm(work, match))

    cand_matchtxt = mt_tmp.matchtextPerson(match, config['match_persons'],
                                           config['match_families'])
    features.append(cos(matchtxt, cand_matchtxt))

    # Names are written as "given/last..."; compare the full name with the
    # slashes removed, then the given and last parts separately.
    work_full = work['name'].replace('/', '')
    match_full = match['name'].replace('/', '')
    nw = work['name'].split('/')
    nm = match['name'].split('/')

    #('namesim', 'Name similarity', 'numeric'),
    features.append(compName(work_full, match_full))
    #('gnamesim', 'Given name similarity', 'numeric'),
    features.append(compName(nw[0], nm[0]))
    #('lnamesim', 'Last name similarity', 'numeric'),
    features.append(compName(nw[1], nm[1]))
    #('nameedit', 'Name edit distance', 'numeric'),
    features.append(strSim(work_full, match_full))
    #('gnameedit', 'Given name edit distance', 'numeric'),
    features.append(strSim(nw[0], nm[0]))
    #('lnameedit', 'Last name edit distance', 'numeric'),
    features.append(strSim(nw[1], nm[1]))

    #('birthDate', 'Birthdate similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'birth', 'date'))
    #('birthYear', 'Birthyear similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'birth', 'date', 4))
    #('birthPlace', 'Birthplace edit distance'), 'numeric'),
    features.append(_svmall42_event_sim(strSim, work, match, 'birth', 'place'))
    #('deathDate', 'Deathdate similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'death', 'date'))
    #('deathYear', 'Deathyear similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'death', 'date', 4))
    #('deathPlace', 'Deathplace edit distance', 'numeric'),
    features.append(_svmall42_event_sim(strSim, work, match, 'death', 'place'))

    return cleanupVect(features)
# Person-matching pass (fragment: the inner loop body continues beyond what
# is visible here).
# NOTE(review): indentation was reconstructed from a whitespace-collapsed
# source -- confirm the block nesting against the original file.
#
# For every person in person_list a match text is built, candidates are
# fetched with search(), and candidate pairs are filtered.
done = []  # pair keys ("idA:idB") that have already been handled
pant = 0  # number of persons processed so far
matchant = 0  # not used in the visible part of this fragment
maxSortVal = 0  # not used in the visible part of this fragment
minScore = 100000  # not used in the visible part of this fragment
for p in person_list.find().batch_size(50):
    pant += 1
    # Average time elapsed since t0 per processed person.
    ptid = (time.time() - t0)/pant
    # print 'Time:',time.time() - t0, pant, ptid, p['refId'],p['name']
    matchtxt = mt_tmp.matchtextPerson(p, person_list, fam_list, config['relations'])
    # Remove * and ? from matchtxt? CHECK
    if not matchtxt:
        # Nothing to search with: report the person and skip it.
        print 'No matchtextdata',p['_id'],p['refId']
        continue     ##########FIX!!!!!!!!!!
    # Third argument is presumably the maximum number of hits -- TODO confirm.
    candidates = search(matchtxt, p['sex'], 10) #Lucene search
    for (candId,score) in candidates:
        if (candId == p['_id']):
            # Same person - insert a dummy match with status EjOK.
            matches.insert({'workid': p['_id'], 'matchid': p['_id'], 'status': 'EjOK'})
            continue
        # Don't match both A->B and B->A: record the pair key in both
        # directions and skip the pair if either key was seen before.
        # NOTE(review): membership tests on a list are linear in its length.
        key = p['_id']+':'+candId
        if key in done: continue
        done.append(key)
        key = candId+':'+p['_id']
        if key in done: continue
        done.append(key)
        # Require no parent in common: skip pairs for which commonParents()
        # reports a positive count.
        if commonParents(p['_id'], candId) > 0: continue
        # Are part of the same relation (father/son, mother/daughter)
# Family-matching pass (fragment).
# NOTE(review): indentation was reconstructed from a whitespace-collapsed
# source -- confirm the block nesting, in particular whether the
# "for rel in ..." loop belongs inside the candidate loop.
#
# Clears the fam_matches collection, then for every family searches for
# candidate families and collects the person ids of families that have an
# acceptable candidate into doPersons.
fam_matches = config['fam_matches']
fam_matches.drop()  # start from an empty collection
match_person = config['match_persons']  # not used in the visible fragment
match_family = config['match_families']  # not used in the visible fragment
dubltmp = defaultdict(list)  # not used in the visible fragment
dbltmpNs = defaultdict(float)  # not used in the visible fragment
doPersons = set()  # ids of persons to consider for person matching
# NOTE(review): the cursor is opened with no_cursor_timeout=True and is not
# explicitly closed in the visible code.
for fam in config['families'].find({}, no_cursor_timeout=True):
    #tot+=1
    #print 'Fam', fam['refId'], fam['_id']
    matchtxt = mt_tmp.matchtextFamily(fam, config['families'], config['persons'], config['relations'])
    candidates = search(matchtxt, 'FAM', 2) #Lucene search
    sc = 0  # highest candidate score accepted so far for this family
    for c in candidates:
        # c[1] is compared as a score; c[0] is presumably the candidate
        # family id (see the commented-out matchFam call) -- TODO confirm.
        if c[1] < 45.0: break #breakoff score point for considering family match
        # Stop once the best score so far is more than twice this
        # candidate's score (c[1] is at least 45.0 here, so never zero).
        if (sc/c[1] > 2.0): break
        if (c[1]> sc): sc = c[1]
        #famMatchData = matchFam(fam['_id'], c[0], config)
        ##famMatchSummary[(tFamId,rFamId)] = famMatchData['summary']
        #fam_matches.insert(famMatchData)
        # Person matches?? Save all persons in fam in the set doPersons.
        for rel in config['relations'].find({'famId': fam['_id']}):
            doPersons.add(rel['persId'])
logging.info('%d persons to consider', len(doPersons))
logging.info('Time %s',time.time() - t0)
# Run matching for doPersons with Lucene search
def _svmall42_event_sim(sim, work, match, event, field, prefix=None):
    """Compare one event field of two person records, tolerating missing data.

    Calls ``sim(work[event][field], match[event][field])``.  When *prefix* is
    given, both values are first sliced to ``[0:prefix]``.  If anything fails
    (missing key, unsliceable value, error raised inside *sim*), the result of
    ``sim(None, None)`` is returned instead.
    """
    try:
        left = work[event][field]
        right = match[event][field]
        if prefix is not None:
            left, right = left[0:prefix], right[0:prefix]
        return sim(left, right)
    except Exception:
        # Best effort: incomplete records still get a feature value.
        # ``Exception`` rather than a bare ``except`` so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        return sim(None, None)


def svmall42(work, match, config, score):
    """Build the 'svmall42' SVM feature vector for the pair (work, match).

    NOTE: the function is currently disabled -- it calls ``sys.exit()`` as its
    first statement (marked "MUST BE FIXED" by the original author), so every
    call raises ``SystemExit`` and nothing below that line runs.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source.  In particular, confirm that the score
    normalisation is meant to apply to caller-supplied scores as well as to
    scores looked up via ``search()``.

    Parameters:
        work:   person record; read keys are '_id', 'sex', 'name' and the
                optional 'birth' / 'death' sub-records with 'date' and 'place'.
        match:  candidate person record with the same keys (except 'sex').
        config: mapping providing 'persons', 'families', 'match_persons' and
                'match_families' (objects with ``find_one``) and, optionally,
                'matchDBname' used to select the normalisation constant.
        score:  search score for the pair; when falsy it is looked up with
                ``search()`` (0.0 if the candidate is not among the hits).

    Returns:
        ``cleanupVect(features)`` for the assembled feature list.

    Raises:
        SystemExit: always, until the guard at the top is removed.
        IndexError: (once enabled) if a name contains no '/' separator, since
            the surname is read as the second '/'-separated part.
    """
    sys.exit()  #MUST BE FIXED

    features = svmfeatures(work, match)
    matchtxt = mt_tmp.matchtextPerson(work, config['persons'], config['families'])

    # Lucene score: look it up among the search hits when the caller gave none.
    if not score:
        candidates = search(matchtxt, work['sex'], ant=30, config=common.config)  # Lucene search
        score = 0.0
        for kid, sc in candidates:
            if str(kid) == str(match['_id']):
                score = sc
                break
    # Normalise with the constant for this match database, falling back to
    # the default constant when that lookup or division fails.
    try:
        score = score / norm[config['matchDBname']]
    except Exception:
        score = score / norm['default']
    if score == 0.0:
        score = 0.01  #??
    features.append(score)

    features.append(nodeSim(work, match))

    # Families in which each person appears as a child.
    workFam = config['families'].find_one({'children': work['_id']})
    matchFam = config['match_families'].find_one({'children': match['_id']})
    features.append(familySim(workFam, config['persons'],
                              matchFam, config['match_persons']))

    features.append(antFeaturesNorm(work, match))

    cand_matchtxt = mt_tmp.matchtextPerson(match, config['match_persons'],
                                           config['match_families'])
    features.append(cos(matchtxt, cand_matchtxt))

    # Names are written as "given/last..."; compare the full name with the
    # slashes removed, then the given and last parts separately.
    work_full = work['name'].replace('/', '')
    match_full = match['name'].replace('/', '')
    nw = work['name'].split('/')
    nm = match['name'].split('/')

    #('namesim', 'Name similarity', 'numeric'),
    features.append(compName(work_full, match_full))
    #('gnamesim', 'Given name similarity', 'numeric'),
    features.append(compName(nw[0], nm[0]))
    #('lnamesim', 'Last name similarity', 'numeric'),
    features.append(compName(nw[1], nm[1]))
    #('nameedit', 'Name edit distance', 'numeric'),
    features.append(strSim(work_full, match_full))
    #('gnameedit', 'Given name edit distance', 'numeric'),
    features.append(strSim(nw[0], nm[0]))
    #('lnameedit', 'Last name edit distance', 'numeric'),
    features.append(strSim(nw[1], nm[1]))

    #('birthDate', 'Birthdate similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'birth', 'date'))
    #('birthYear', 'Birthyear similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'birth', 'date', 4))
    #('birthPlace', 'Birthplace edit distance'), 'numeric'),
    features.append(_svmall42_event_sim(strSim, work, match, 'birth', 'place'))
    #('deathDate', 'Deathdate similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'death', 'date'))
    #('deathYear', 'Deathyear similarity'), 'numeric'),
    features.append(_svmall42_event_sim(dateSim, work, match, 'death', 'date', 4))
    #('deathPlace', 'Deathplace edit distance', 'numeric'),
    features.append(_svmall42_event_sim(strSim, work, match, 'death', 'place'))

    return cleanupVect(features)