Пример #1
0
def svmall42(work, match, config, score):
    """Assemble the SVM feature vector for a work person / candidate pair.

    NOTE: the function is currently disabled. Its first statement calls
    ``sys.exit()`` (flagged "MUST BE FIXED" by the author), so every call
    raises ``SystemExit`` and none of the code below it runs. The guard is
    kept on purpose; the remaining body is what runs once it is removed.

    Args:
        work: person record; the keys read are '_id', 'sex', 'name' and,
            when present, 'birth' / 'death' sub-records with 'date' and
            'place'.
        match: candidate person record, read with the same keys.
        config: mapping holding the 'persons', 'families', 'match_persons'
            and 'match_families' collections and, optionally,
            'matchDBname'.
        score: Lucene score of the pair; when falsy it is looked up by
            running the search again.

    Returns:
        Whatever ``cleanupVect`` returns for the collected feature list.

    Raises:
        SystemExit: always, while the guard is in place.
    """
    sys.exit()  # MUST BE FIXED
    features = svmfeatures(work, match)
    matchtxt = mt_tmp.matchtextPerson(work, config['persons'],
                                      config['families'])

    # Lucene score: if the caller supplied none, search again and take the
    # score of the hit whose id equals the candidate's id (0.0 if absent).
    if not score:
        # NOTE(review): the search is given common.config, not the `config`
        # argument -- confirm this is intended.
        candidates = search(matchtxt, work['sex'], ant=30,
                            config=common.config)
        match_id = str(match['_id'])
        score = next(
            (sc for (kid, sc) in candidates if str(kid) == match_id), 0.0)

    # Normalise with the factor registered for the match database; any
    # failure (typically a missing key) falls back to the default factor.
    try:
        score = score / norm[config['matchDBname']]
    except Exception:
        score = score / norm['default']
    if score == 0.0:
        score = 0.01  # NOTE(review): floor was marked "??" by the author
    features.append(score)

    features.append(nodeSim(work, match))
    workFam = config['families'].find_one({'children': work['_id']})
    matchFam = config['match_families'].find_one({'children': match['_id']})
    features.append(familySim(workFam, config['persons'],
                              matchFam, config['match_persons']))
    features.append(antFeaturesNorm(work, match))
    cand_matchtxt = mt_tmp.matchtextPerson(match, config['match_persons'],
                                           config['match_families'])
    features.append(cos(matchtxt, cand_matchtxt))

    # Name features. Names are split on '/'; part 0 is used as the given
    # name and part 1 as the last name.
    # NOTE(review): a name without any '/' raises IndexError here.
    work_flat = work['name'].replace('/', '')
    match_flat = match['name'].replace('/', '')
    nw = work['name'].split('/')
    nm = match['name'].split('/')
    name_pairs = ((work_flat, match_flat), (nw[0], nm[0]), (nw[1], nm[1]))
    # Order: namesim, gnamesim, lnamesim (compName), then
    #        nameedit, gnameedit, lnameedit (strSim).
    for sim in (compName, strSim):
        for (work_part, match_part) in name_pairs:
            features.append(sim(work_part, match_part))

    # Event features, per event: date similarity, year similarity (first
    # four characters of the date), place edit distance.
    for event in ('birth', 'death'):
        features.append(_svm_event_feature(dateSim, work, match,
                                           event, 'date'))
        features.append(_svm_event_feature(dateSim, work, match,
                                           event, 'date', 4))
        features.append(_svm_event_feature(strSim, work, match,
                                           event, 'place'))

    return cleanupVect(features)


def _svm_event_feature(sim, work, match, event, field, prefix_len=None):
    """Return sim() of one event field of both records, or sim(None, None).

    Best-effort: any ordinary error while reading the field (missing key,
    non-sliceable value, ...) or while comparing yields ``sim(None, None)``.
    ``Exception`` is caught instead of a bare ``except`` so that
    ``SystemExit`` and ``KeyboardInterrupt`` still propagate.
    """
    try:
        work_value = work[event][field]
        match_value = match[event][field]
        if prefix_len is not None:
            work_value = work_value[0:prefix_len]
            match_value = match_value[0:prefix_len]
        return sim(work_value, match_value)
    except Exception:
        return sim(None, None)
Пример #2
0
# Person-level matching pass (script fragment; the inner loop body continues
# beyond the part shown here).
# For every person, build a match text, run a Lucene search for candidates
# and filter the candidate pairs before they are compared further.
done = []          # 'idA:idB' keys of the pairs already handled (both orders)
pant = 0           # number of persons processed so far
matchant = 0
maxSortVal = 0
minScore = 100000
for p in person_list.find().batch_size(50):
    pant += 1
    ptid = (time.time() - t0)/pant  # average elapsed time per person so far
#    print 'Time:',time.time() - t0, pant, ptid, p['refId'],p['name']

    matchtxt = mt_tmp.matchtextPerson(p, person_list, fam_list, config['relations'])
    # Remove * and ? from matchtxt? CHECK
    if not matchtxt:
        # Nothing to search with: report the person and skip it.
        print 'No matchtextdata',p['_id'],p['refId']
        continue       # FIX! (flagged by the original author)
    candidates = search(matchtxt, p['sex'], 10) #Lucene search
    for (candId,score) in candidates:
        if (candId == p['_id']):
            # Same person: insert a dummy match with status 'EjOK'.
            matches.insert({'workid': p['_id'], 'matchid': p['_id'], 'status': 'EjOK'})
            continue
        # Don't match both A->B and B->A: a pair is skipped when either key
        # order is already recorded; otherwise both orders get recorded.
        key = p['_id']+':'+candId
        if key in done: continue
        done.append(key)
        key = candId+':'+p['_id']
        if key in done: continue
        done.append(key)
        # Skip the pair when commonParents() reports a shared parent.
        if commonParents(p['_id'], candId) > 0: continue
        # Part of the same relation (father/son, mother/daughter)
Пример #3
0
# Family-level pre-filter (script fragment): collect, in doPersons, the
# persons of every family that has at least one sufficiently strong Lucene
# family match.
fam_matches = config['fam_matches']
fam_matches.drop()  # drop the collection holding earlier family matches

match_person = config['match_persons']
match_family = config['match_families']

dubltmp = defaultdict(list)
dbltmpNs = defaultdict(float)
doPersons = set()
# A cursor opened with no_cursor_timeout=True is not reaped by the server,
# so it is closed explicitly even when the loop is left through an error.
famCursor = config['families'].find({}, no_cursor_timeout=True)
try:
    for fam in famCursor:
        matchtxt = mt_tmp.matchtextFamily(fam, config['families'],
                                          config['persons'],
                                          config['relations'])
        candidates = search(matchtxt, 'FAM', 2)  # Lucene search
        sc = 0  # highest candidate score seen so far for this family
        famAccepted = False
        for c in candidates:
            if c[1] < 45.0:
                break  # break-off score for considering a family match
            if sc / c[1] > 2.0:
                break  # less than half of the best score seen: stop
            if c[1] > sc:
                sc = c[1]
            # Storing matchFam(fam['_id'], c[0], config) in fam_matches is
            # disabled in the original code; only acceptance is recorded.
            famAccepted = True
        if famAccepted:
            # Person matches: remember every person of this family. The
            # query depends only on the family, so it runs once per family
            # rather than once per accepted candidate.
            for rel in config['relations'].find({'famId': fam['_id']}):
                doPersons.add(rel['persId'])
finally:
    famCursor.close()
logging.info('%d persons to consider', len(doPersons))
logging.info('Time %s', time.time() - t0)

# Run matching for doPersons with Lucene search
Пример #4
0
def svmall42(work, match, config, score):
    """Assemble the SVM feature vector for a work person / candidate pair.

    NOTE: the function is currently disabled. Its first statement calls
    ``sys.exit()`` (flagged "MUST BE FIXED" by the author), so every call
    raises ``SystemExit`` and none of the code below it runs. The guard is
    kept on purpose; the remaining body is what runs once it is removed.

    Args:
        work: person record; the keys read are '_id', 'sex', 'name' and,
            when present, 'birth' / 'death' sub-records with 'date' and
            'place'.
        match: candidate person record, read with the same keys.
        config: mapping holding the 'persons', 'families', 'match_persons'
            and 'match_families' collections and, optionally,
            'matchDBname'.
        score: Lucene score of the pair; when falsy it is looked up by
            running the search again.

    Returns:
        Whatever ``cleanupVect`` returns for the collected feature list.

    Raises:
        SystemExit: always, while the guard is in place.
    """
    sys.exit()  # MUST BE FIXED
    features = svmfeatures(work, match)
    matchtxt = mt_tmp.matchtextPerson(work, config['persons'],
                                      config['families'])

    # Lucene score: if the caller supplied none, search again and take the
    # score of the hit whose id equals the candidate's id (0.0 if absent).
    if not score:
        # NOTE(review): the search is given common.config, not the `config`
        # argument -- confirm this is intended.
        candidates = search(matchtxt, work['sex'], ant=30,
                            config=common.config)
        match_id = str(match['_id'])
        score = next(
            (sc for (kid, sc) in candidates if str(kid) == match_id), 0.0)

    # Normalise with the factor registered for the match database; any
    # failure (typically a missing key) falls back to the default factor.
    try:
        score = score / norm[config['matchDBname']]
    except Exception:
        score = score / norm['default']
    if score == 0.0:
        score = 0.01  # NOTE(review): floor was marked "??" by the author
    features.append(score)

    features.append(nodeSim(work, match))
    workFam = config['families'].find_one({'children': work['_id']})
    matchFam = config['match_families'].find_one({'children': match['_id']})
    features.append(familySim(workFam, config['persons'],
                              matchFam, config['match_persons']))
    features.append(antFeaturesNorm(work, match))
    cand_matchtxt = mt_tmp.matchtextPerson(match, config['match_persons'],
                                           config['match_families'])
    features.append(cos(matchtxt, cand_matchtxt))

    # Name features. Names are split on '/'; part 0 is used as the given
    # name and part 1 as the last name.
    # NOTE(review): a name without any '/' raises IndexError here.
    work_flat = work['name'].replace('/', '')
    match_flat = match['name'].replace('/', '')
    nw = work['name'].split('/')
    nm = match['name'].split('/')
    name_pairs = ((work_flat, match_flat), (nw[0], nm[0]), (nw[1], nm[1]))
    # Order: namesim, gnamesim, lnamesim (compName), then
    #        nameedit, gnameedit, lnameedit (strSim).
    for sim in (compName, strSim):
        for (work_part, match_part) in name_pairs:
            features.append(sim(work_part, match_part))

    # Event features, per event: date similarity, year similarity (first
    # four characters of the date), place edit distance.
    for event in ('birth', 'death'):
        features.append(_svm_event_feature(dateSim, work, match,
                                           event, 'date'))
        features.append(_svm_event_feature(dateSim, work, match,
                                           event, 'date', 4))
        features.append(_svm_event_feature(strSim, work, match,
                                           event, 'place'))

    return cleanupVect(features)


def _svm_event_feature(sim, work, match, event, field, prefix_len=None):
    """Return sim() of one event field of both records, or sim(None, None).

    Best-effort: any ordinary error while reading the field (missing key,
    non-sliceable value, ...) or while comparing yields ``sim(None, None)``.
    ``Exception`` is caught instead of a bare ``except`` so that
    ``SystemExit`` and ``KeyboardInterrupt`` still propagate.
    """
    try:
        work_value = work[event][field]
        match_value = match[event][field]
        if prefix_len is not None:
            work_value = work_value[0:prefix_len]
            match_value = match_value[0:prefix_len]
        return sim(work_value, match_value)
    except Exception:
        return sim(None, None)