示例#1
0
def search(raw_query, query_type='/fast/all'):
    """
    Hit the FAST API for names.

    Queries the OCLC FAST suggest service and returns at most three
    candidate dicts (id, name, score, match, type) sorted by descending
    score.
    """
    candidates = []
    seen_fast_ids = []
    normalized = text.normalize(raw_query, PY3)
    query = normalized.replace('the university of', 'university of').strip()
    # Map the OpenRefine query type onto a FAST query index.
    type_meta = [entry for entry in refine_to_fast if entry['id'] == query_type]
    if not type_meta:
        type_meta = default_query
    index = type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        url = (api_base_url + '?query=' + urllib.parse.quote(query)
               + '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2ctag%2cscore&suggest=autoSubject'
               + '&queryIndex=' + index + '&wt=json')
        app.logger.debug("FAST API url is " + url)
        results = requests.get(url).json()
    except Exception as e:
        # Best effort: any request/parse failure yields no candidates.
        app.logger.warning(e)
        return candidates
    for item in results['response']['docs']:
        name = item.get('auth')
        tag = item.get('tag')
        alternates = item.get('suggestall')
        alt = alternates[0] if len(alternates) > 0 else ''
        fid = item.get('idroot')
        fast_uri = make_uri(fid)
        # The FAST service returns many duplicates; keep the first of each id.
        if fid in seen_fast_ids:
            continue
        seen_fast_ids.append(fid)
        #score_1 = fuzz.token_sort_ratio(query, name)
        #score_2 = fuzz.token_sort_ratio(query, alt)
        #Return a maximum score
        #score = max(score_1, score_2)
        # Exact match against either the authorized or first alternate form.
        # NOTE(review): 'score' carries the raw FAST 'tag' value; the fuzzy
        # scoring above is commented out — confirm this is intended.
        is_match = (query == text.normalize(name, PY3)
                    or query == text.normalize(alt, PY3))
        candidates.append({
            "id": fast_uri,
            "name": name,
            "score": tag,
            "match": is_match,
            "type": type_meta
        })
    #Sort this list by score
    ranked = sorted(candidates, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return ranked[:3]
示例#2
0
def search(raw_query, query_type='/fast/all'):
    """
    Hit the FAST API for names.

    Uses ``urllib.quote`` (Python 2 URL quoting). Returns at most three
    candidate dicts (id, name, score, match, type) sorted by descending
    score.
    """
    out = []
    unique_fast_ids = []
    # Normalize the query; FAST indexes "university of X" without the article.
    query = text.normalize(raw_query, PY3).replace('the university of', 'university of').strip()
    # Map the OpenRefine query type onto a FAST query index.
    query_type_meta = [i for i in refine_to_fast if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        url = api_base_url + '?query=' + urllib.quote(query)
        url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2ctag%2cscore&suggest=autoSubject'
        url += '&queryIndex=' + query_index + '&wt=json'
        app.logger.debug("FAST API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception as e:
        # Best effort: any request/parse failure returns no candidates.
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['response']['docs']):
        match = False
        name = item.get('auth')
        tag = item.get('tag')
        alternate = item.get('suggestall')
        if (len(alternate) > 0):
            alt = alternate[0]
        else:
            alt = ''
        fid = item.get('idroot')
        fast_uri = make_uri(fid)
        #The FAST service returns many duplicates.  Avoid returning many of the
        #same result
        if fid in unique_fast_ids:
            continue
        else:
            unique_fast_ids.append(fid)
        #score_1 = fuzz.token_sort_ratio(query, name)
        #score_2 = fuzz.token_sort_ratio(query, alt)
        #Return a maximum score
        #score = max(score_1, score_2)
        # Exact match against the authorized or first alternate form.
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        # NOTE(review): 'score' carries the raw FAST 'tag' value here; the
        # fuzzy scoring above is commented out — confirm this is intended.
        resource = {
            "id": fast_uri,
            "name": name,
            "score": tag,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#3
0
def search(raw_query, authtype, limit=3):
    """
    Hit the QA API for names.

    Tries each QA authority/subauthority mapped to ``authtype`` in turn,
    stopping the whole search once an exact match has been found, and
    returns up to ``limit`` candidates sorted by descending fuzzy score.
    """
    out = []
    unique_ids = []
    query = text.normalize(raw_query).strip()
    match = False
    for qtype in auth_map[authtype]:
        # An exact match in an earlier subauthority ends the whole search.
        if match: break

        auth, subauth = split_id(qtype)

        query_type_meta = [{"id": subauth, "name": authtype}]

        url = base_url + auth + "/" + subauth
        url += '?q=' + urllib.quote(query)

        try:
            resp = requests.get(url)
            results = json.loads(resp.text)
        except Exception as e:
            # On failure, return whatever was collected from earlier
            # subauthorities rather than nothing.
            app.logger.error(e)
            sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
            return sorted_out[:int(limit)]

        for position, item in enumerate(results):
            # Cap results per subauthority; stop after an exact match.
            if position > max_results or match: break

            uri = item["id"]
            name = item["label"]

            #Avoid returning many of the
            #same result
            if uri in unique_ids:
                continue
            else:
                unique_ids.append(uri)

            # Score against both normalized and raw queries; keep the best.
            score_1 = fuzz.token_sort_ratio(query, name)
            score_2 = fuzz.token_sort_ratio(raw_query, name)

            #Return a maximum score
            score = max(score_1, score_2)
            if query == text.normalize(name) or raw_query == text.normalize(
                    name):
                match = True
            resource = {
                "id": uri,
                "name": name,
                "score": score,
                "match": match,
                "type": query_type_meta
            }
            out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    return sorted_out[:limit]
def search(raw_query, auth, subauth, limit):
    """
    Hit the QA API

    Queries one authority/subauthority pair and returns up to ``limit``
    candidate dicts (id, name, score, match, type) sorted by descending
    fuzzy score. Stops collecting once an exact match is recorded.
    """
    candidates = []
    seen_uris = []
    query = text.normalize(raw_query).strip()

    authority = authority_names[auth]
    full_name = authority["name"] + " " + authority["subauthorities"][subauth]
    type_meta = [{"id": full_id(auth, subauth), "name": full_name}]

    url = base_url + auth + "/" + subauth + '?q=' + urllib.quote(query)

    try:
        results = json.loads(requests.get(url).text)
    except Exception as e:
        # Best effort: any request/parse failure yields no candidates.
        app.logger.warning(e)
        return candidates
    found_exact = False
    for item in results:
        # Stop scanning once an exact match has been recorded.
        if found_exact:
            break

        uri = item["id"]
        label = item["label"]

        # Skip duplicate URIs returned by the service.
        if uri in seen_uris:
            continue
        seen_uris.append(uri)

        # Score against both the normalized and the raw query; keep the best.
        score = max(fuzz.token_sort_ratio(query, label),
                    fuzz.token_sort_ratio(raw_query, label))
        if query == text.normalize(label) or raw_query == text.normalize(label):
            found_exact = True
        candidates.append({
            "id": uri,
            "name": label,
            "score": score,
            "match": found_exact,
            "type": type_meta
        })
    #Sort this list by score
    ranked = sorted(candidates, key=itemgetter('score'), reverse=True)
    #Refine chooses how many matches to return.
    return ranked[:limit]
示例#5
0
def classify():
    """
    Score each platform section for 'decisive' phrases and Loriot tokens.

    For every party/section, records the section token count plus the
    number of decisive phrases and Loriot tokens occurring in the
    normalized section text, each normalized by section length, and
    writes the result to data/language.json.
    """
    # BUG FIX: materialize map() into a list — under Python 3 a map
    # iterator would be exhausted after the first section, leaving the
    # phrase list empty for every later section. Context managers close
    # the input files instead of leaking the handles.
    with open('decisive.txt', 'rb') as decisive_fh:
        decisive = [norm(line) for line in decisive_fh.readlines()]
    with open('loriot.txt', 'rb') as loriot_fh:
        loriot = list(tokenize(loriot_fh.read().decode('utf-8')))
    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            scores[party][section.key] = {'tokens': len(section)}
            text = normalize(section.text)
            # Count decisive phrases present, as a float so the later
            # division stays a float division under Python 2.
            n_decisive = 0.0
            for phrase in decisive:
                if phrase in text:
                    n_decisive += 1
            scores[party][section.key]['decisive'] = n_decisive/len(section)
            # Same counting for Loriot tokens.
            n_loriot = 0.0
            for token in loriot:
                if token in text:
                    n_loriot += 1
            scores[party][section.key]['loriot'] = n_loriot/len(section)
    # BUG FIX: json.dump writes str, so the output file must be opened in
    # text mode ('wb' raises TypeError under Python 3).
    with open('data/language.json', 'w') as fh:
        json.dump(dict(scores), fh, indent=2)
示例#6
0
def search(raw_query, query_type='/object/literal'):
    """
    Call the LDF AAT Server (on Heroku) for matching triples

    Dispatches on ``query_type``: a '/subject/URI' query resolves a URI to
    labels; anything else is treated as an object literal to resolve to
    URIs. NOTE(review): the result-building code after each fetch appears
    unfinished ("rest of work" markers below) — this function currently
    returns None on success.
    """
    #Making empty arrays for storing recon objects, already reconciled URIs
    out = []
    unique_aat_ids = []

    #Determing the Triple part/OpenRefine query index to be used below
    query_type_meta = [i for i in refine_to_aat if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']

    #Cleaning/normalizing the query/data to be reconciled taken from OpenRefine
    #Also dependent on index/triple component, due to LDF URL query structure
    #subject URIs need to be checked are URIs (not just IDs), and if not, made into URIs
    #objects literals need to be put into "", given language declaration
    #currently just using @en default language declaration; need to find way to support
    #multiple/user-designated language declarations -
    #use OR recon service API query other fields/columns for this?
    query = text.normalize(raw_query, PY3).strip()

    #Create the LDF Server Request URL/Triple Pattern Fragments API
    if query_type=="/subject/URI":
        #Structure for getting labels from URIs
        #http://myldfserver.org/vocab?subject=URI&predicate=prefLabel
        try:
            if PY3:
                url = LDF_base_url + '?subject=' + urllib.parse.quote(query)
            else:
                url = LDF_base_url + '?subject=' + urllib.quote(query)
            app.logger.debug("LDF query url is " + url)
            rdfrecon.getRDFobject(url, PY3)
            # rest of work
        # NOTE(review): getopt.GetoptError is unlikely to be raised by these
        # calls — confirm the intended exception type.
        except getopt.GetoptError as e:
            app.logger.warning(e)
            return out
    else:
        #Treat everything else as object literals wanting labels
        #Structure for getting URIs
        #http://myldfserver.org/vocab?predicate=[prefLabel|altLabel|label]&object="query"@en
        try:
            if PY3:
                url = LDF_base_url + '?object=' + urllib.parse.quote(query)
            else:
                url = LDF_base_url + '?object=' + urllib.quote(query)
            app.logger.debug("LDF query url is " + url)
            rdfrecon.getRDFsubject(url, PY3)
            #rest of work
        except getopt.GetoptError as e:
            app.logger.warning(e)
            return out
        #Now use rdflib to parse results, get triple components



        # SKOS/RDFS predicate URIs for label extraction (unused so far).
        prefLabel = 'http://www.w3.org/2004/02/skos/core#prefLabel'
        altLabel = 'http://www.w3.org/2004/02/skos/core#altLabel'
        label = 'http://www.w3.org/2000/01/rdf-schema#label'
示例#7
0
def search(raw_query, query_type='/lc'):
    """
    Hit the LoC Authorities API for names.

    Returns at most three candidate dicts (id, name, score, match, type)
    sorted by descending fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Map the OpenRefine query type onto an id.loc.gov path component.
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(
                query)
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(
                query)
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # BUG FIX: previously caught only getopt.GetoptError, which the
    # request/JSON calls never raise, so network or parse failures escaped
    # instead of returning an empty result list (the FAST/ISNI siblings
    # already catch Exception here).
    except Exception as e:
        app.logger.warning(e)
        return out
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        lc_uri = results[3][n]
        #Get score for label found
        score_1 = fuzz.token_sort_ratio(query, text.normalize(name, PY3))
        score = score_1
        # THIS IS WHERE I WILL GRAB ALTLABELS FROM URI.SKOS.NT ONCE I GET THAT PART WORKING => GIT BRANCH ALTLABEL
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + lc_uri)
        resource = {
            "id": lc_uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#8
0
def search(raw_query, query_type='/lc'):
    """
    Hit the LoC Authorities API for names.

    Returns at most three candidate dicts (id, name, score, match, type)
    sorted by descending fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Map the OpenRefine query type onto an id.loc.gov path component.
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index  + '/suggest/?q=' + urllib.parse.quote(query)
        else:
            url = "http://id.loc.gov" + query_index  + '/suggest/?q=' + urllib.quote(query)
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # BUG FIX: previously caught only getopt.GetoptError, which the
    # request/JSON calls never raise, so network or parse failures escaped
    # instead of returning an empty result list (the FAST/ISNI siblings
    # already catch Exception here).
    except Exception as e:
        app.logger.warning(e)
        return out
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        lc_uri = results[3][n]
        #Get score for label found
        score_1 = fuzz.token_sort_ratio(query, text.normalize(name, PY3))
        score = score_1
        # THIS IS WHERE I WILL GRAB ALTLABELS FROM URI.SKOS.NT ONCE I GET THAT PART WORKING => GIT BRANCH ALTLABEL
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + lc_uri)
        resource = {
            "id": lc_uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#9
0
def is_valid_attorney(attorney, defendant_fullname=None):
    """ Returns False if attorney is empty or is defendant """
    # Substring markers meaning "no real attorney". Deduplicated from the
    # original list, which repeated 'NO ATTORNEY', 'PRO SE' and 'UNKNOWN'.
    bad_substrings = ('NO ATTORNEY', 'PRO SE', 'UNKNOWN', 'PRO PRE', 'PROSE',
                      'PROPRE', 'PROPER', 'PRO PER', 'UNREPRESENTED', 'N/A',
                      'NO-ATTORNEY', 'PUBLIC', 'DEFENDER', 'DEFENDANT',
                      'RESPONDENT', 'RESPONDER')
    # Markers that must match the whole (normalized) field exactly.
    bad_exact = ('NO', 'NONE', 'NA', 'N.A.', 'UNK', 'UNKNWN')
    if not attorney:
        return False
    # Reject the attorney field when it is actually the defendant's name.
    if defendant_fullname and smart_cmp(defendant_fullname, attorney):
        return False
    attorney = normalize(attorney).upper().strip()
    if any(marker in attorney for marker in bad_substrings):
        return False
    if attorney in bad_exact:
        return False
    return True
示例#10
0
文件: lda.py 项目: persona1346/cs102
def lda(domain):
    """Train a two-topic LDA model over the normalized texts of *domain*.

    Builds a gensim dictionary and bag-of-words corpus from the
    normalized documents, then fits and returns an LdaModel.
    """
    documents = normalize(domain=domain)

    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # Note: local renamed from 'lda' so it no longer shadows the function.
    model = LdaModel(corpus,
                     num_topics=2,
                     per_word_topics=True,
                     id2word=dictionary)

    # print(common_dictionary.token2id)
    return model
示例#11
0
def search(raw_query):
    """
    Query the Getty AAT service and return up to ten candidates.

    Parses Preferred_Parent entries from the XML response, extracting the
    label and the bracketed id from the first path component, and returns
    dicts (id, name, score, match, type) sorted by descending fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    query_type_meta = [i for i in full_query]
    #query_index = query_type_meta[0]['index']

    # Get the results
    try:
        if PY3:
            url = api_base_url + \
                urllib.parse.quote(query) + '&logop=and&notes='
        else:
            url = api_base_url + urllib.quote(query) + '&logop=and&notes='
        app.logger.debug("AAT url is " + url)
        resp = requests.get(url)
        results = ET.fromstring(resp.content)
    # BUG FIX: was getopt.GetoptError, which these calls never raise;
    # catch request/parse failures like the sibling search() functions.
    except Exception as e:
        app.logger.warning(e)
        return out

    for child in results.iter('Preferred_Parent'):
        match = False
        try:
            name = re.sub(r'\[.+?\]', '', child.text.split(',')[0]).strip()
            # the termid is NOT the ID ! We have to find it in the first prefered parent
            # (local renamed from 'id' to avoid shadowing the builtin)
            term_id = re.search(r"\[(.+?)\]", child.text.split(',')[0]).group(1)
            score = fuzz.token_sort_ratio(query, name)
        except AttributeError:
            # BUG FIX: 'pass' left name/term_id/score undefined on the first
            # iteration (NameError) or stale from the previous entry; skip
            # malformed entries instead.
            continue
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + term_id)
        resource = {
            "id": term_id,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)

    # Sort this list containing prefterms by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)

    # Refine only will handle top 10 matches.
    return sorted_out[:10]
示例#12
0
def search(raw_query, query_type='/geonames/all'):
    """
    Hit the GeoNames API for names.

    Python 2 code (``urllib.quote``, ``except Exception, e``). Only the
    request/fetch portion is present in this excerpt.
    """
    out = []
    unique_geonames_ids = []
    # Convert LoC-style headings into GeoNames-friendly queries first.
    mid_query = lc_parse.lc2geonames(raw_query)
    query = text.normalize(mid_query).strip()
    # Map the OpenRefine query type onto a GeoNames query index.
    query_type_meta = [i for i in refine_to_geonames if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        url = api_base_url + query_index  + '=' + urllib.quote(query)
        app.logger.debug("GeoNames API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception, e:
        # Best effort: any failure returns the (empty) result list.
        app.logger.warning(e)
        return out
示例#13
0
def search(raw_query, query_type='/fast/all'):
    """
    Hit the FAST API for names.

    Python 2 code (``urllib.quote``, ``except Exception, e``). Only the
    request/fetch portion is present in this excerpt.
    """
    out = []
    unique_fast_ids = []
    # Normalize the query; FAST indexes "university of X" without the article.
    query = text.normalize(raw_query).replace('the university of', 'university of').strip()
    # Map the OpenRefine query type onto a FAST query index.
    query_type_meta = [i for i in refine_to_fast if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        url = api_base_url + '?query=' + urllib.quote(query)
        url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject'
        url += '&queryIndex=' + query_index + '&wt=json'
        app.logger.debug("FAST API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception, e:
        # Best effort: any failure returns the (empty) result list.
        app.logger.warning(e)
        return out
示例#14
0
def search(raw_query, query_type='/fast/all'):
    """
    Hit the FAST API for names.

    Python 2 code (``urllib.quote``, ``except Exception, e``). Only the
    request/fetch portion is present in this excerpt.
    """
    out = []
    unique_fast_ids = []
    # Normalize the query; FAST indexes "university of X" without the article.
    query = text.normalize(raw_query).replace('the university of',
                                              'university of').strip()
    # Map the OpenRefine query type onto a FAST query index.
    query_type_meta = [i for i in refine_to_fast if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        url = api_base_url + '?query=' + urllib.quote(query)
        url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject'
        url += '&queryIndex=' + query_index + '&wt=json'
        app.logger.debug("FAST API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception, e:
        # Best effort: any failure returns the (empty) result list.
        app.logger.warning(e)
        return out
示例#15
0
def search(raw_query, query_type='/isni/name'):
    """
    Hit the ISNI API for names.

    Parses SRW records for personal or organisation names and returns at
    most three candidate dicts (id, name, score, match, type) sorted by
    descending fuzzy score.
    """
    out = []
    unique_isni_ids = []
    query = text.normalize(raw_query, PY3).strip().replace(',', '')
    query_type_meta = [i for i in refine_to_isni if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #ISNI api requires spaces to be encoded as %20 rather than +
        if PY3:
            url = api_base_url + query_index + "+%3D+'" + urllib.parse.quote(query) +"'"
        else:
            url = api_base_url + query_index + "+%3D+'" + urllib.quote(query) + "'"
        app.logger.debug("ISNI API url is " + url)
        resp = requests.get(url)
        results = etree.fromstring(resp.content)
    except Exception as e:
        app.logger.warning(e)
        return out
    for record in results.iter("{http://www.loc.gov/zing/srw/}record"):
        match = False
        names = []
        if record.xpath(".//personalName"):
            for pers in record.xpath(".//personalName"):
                # Optional sub-elements: find() returns None when missing, so
                # .text raises AttributeError (was a bare except before).
                try:
                    fname = pers.find("forename").text
                except AttributeError:
                    fname = ''
                lname = pers.find("surname").text
                try:
                    date = pers.find("dates").text
                except AttributeError:
                    date = ''
                name = str(fname) + " " + lname + ' ' + str(date)
                # BUG FIX: strip('') was a no-op; strip surrounding whitespace.
                names.append(name.strip())
            refine_name = names[0]
        elif record.xpath(".//organisation"):
            for org in record.xpath(".//organisationName"):
                mainname = org.find("mainName").text
                try:
                    subname = org.find("subdivisionName").text
                except AttributeError:
                    subname = ''
                # BUG FIX: the result of strip() was previously discarded.
                names.append((mainname + ' ' + str(subname)).strip())
            refine_name = names[0]
        # BUG FIX: a record with neither personal nor organisation names
        # previously crashed (max() on an empty set / undefined refine_name).
        if not names:
            continue
        isni_uri = record.xpath(".//isniURI")[0].text
        if isni_uri in unique_isni_ids:
            continue
        else:
            unique_isni_ids.append(isni_uri)
        # Best fuzzy score across all name forms for the record.
        scores = set()
        app.logger.debug(names)
        for name in names:
            nscore = fuzz.token_sort_ratio(query, name)
            scores.add(nscore)
        score = max(scores)
        for name in names:
            if query == text.normalize(name, PY3):
                match = True
        resource = {
            "id": isni_uri,
            "name": refine_name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#16
0
def search(raw_query, query_type='/lc'):
    """
    Hit the LoC suggest and didyoumean APIs for names.

    Combines primary headings (suggest) with cross-references
    (didyoumean) and returns at most three candidate dicts
    (id, name, score, match, type) sorted by descending fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Map the OpenRefine query type onto an id.loc.gov path component.
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    # Get the results for the primary suggest API (primary headings, no cross-refs)
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(query.encode('utf8'))
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(query.encode('utf8'))
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # NOTE(review): getopt.GetoptError is not raised by requests/json, so
    # network failures will propagate — confirm the intended exception type.
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        uri = results[3][n]
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # Get the results for the didyoumean API (cross-refs, no primary headings)
    try:
        if query_index != '/authorities':
            if PY3:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
            else:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            altresp = requests.get(url)
            altresults = ET.fromstring(altresp.content)
            altresults2 = None
        else:
            # For the generic /authorities index, query both the names and
            # subjects subauthorities.
            if PY3:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
            else:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            app.logger.debug("LC Authorities API url is " + url2)
            altresp = requests.get(url)
            altresp2 = requests.get(url2)
            altresults = ET.fromstring(altresp.content)
            altresults2 = ET.fromstring(altresp2.content)
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out
    # Collect cross-reference terms from the first didyoumean response.
    for child in altresults.iter('{http://id.loc.gov/ns/id_service#}term'):
        match = False
        name = child.text
        uri = child.get('uri')
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # And, when the subjects subauthority was also queried, from the second.
    if altresults2 is not None:
        for child in altresults2.iter('{http://id.loc.gov/ns/id_service#}term'):
            match = False
            name = child.text
            uri = child.get('uri')
            score = fuzz.token_sort_ratio(query, name)
            if score > 95:
                match = True
            app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
            resource = {
                "id": uri,
                "name": name,
                "score": score,
                "match": match,
                "type": query_type_meta
            }
            out.append(resource)
    # Sort this list containing preflabels and crossrefs by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top three matches.
    return sorted_out[:3]
示例#17
0
         alt = alternate[0]
     else:
         alt = ''
     fid = item.get('idroot')
     fast_uri = make_uri(fid)
     #The FAST service returns many duplicates.  Avoid returning many of the
     #same result
     if fid in unique_fast_ids:
         continue
     else:
         unique_fast_ids.append(fid)
     score_1 = fuzz.token_sort_ratio(query, name)
     score_2 = fuzz.token_sort_ratio(query, alt)
     #Return a maximum score
     score = max(score_1, score_2)
     if query == text.normalize(name):
         match = True
     elif query == text.normalize(alt):
         match = True
     resource = {
         "id": fast_uri,
         "name": name,
         "score": score,
         "match": match,
         "type": query_type_meta
     }
     out.append(resource)
 #Sort this list by score
 sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
 #Refine only will handle top three matches.
 return sorted_out[:3]
示例#18
0
         alt = ''
     geonames_id = item.get('geonameId')
     geonames_uri = make_uri(geonames_id)
     lat = item.get('lat')
     lng = item.get('lng')
     #Way to cheat + get name + coordinates into results:
     name_coords = name + ' | ' + lat + ', ' + lng
     #Avoid returning duplicates:
     if geonames_id in unique_geonames_ids:
         continue
     else:
         unique_geonames_ids.append(geonames_id)
     score_1 = fuzz.token_sort_ratio(query, name)
     score_2 = fuzz.token_sort_ratio(query, alt)
     score = max(score_1, score_2)
     if query == text.normalize(name):
         match = True
     elif query == text.normalize(alt):
         match = True
     resource = {
         "id": geonames_uri,
         "name": name_coords,
         "score": score,
         "match": match,
         "type": query_type_meta
     }
     out.append(resource)
 #Sort this list by score
 sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
 #Refine only will handle top three matches.
 return sorted_out[:3]
示例#19
0
         alt = alternate[0]
     else:
         alt = ''
     fid = item.get('idroot')
     fast_uri = make_uri(fid)
     #The FAST service returns many duplicates.  Avoid returning many of the
     #same result
     if fid in unique_fast_ids:
         continue
     else:
         unique_fast_ids.append(fid)
     score_1 = fuzz.token_sort_ratio(query, name)
     score_2 = fuzz.token_sort_ratio(query, alt)
     #Return a maximum score
     score = max(score_1, score_2)
     if query == text.normalize(name, PY3):
         match = True
     elif query == text.normalize(alt, PY3):
         match = True
     resource = {
         "id": fast_uri,
         "name": name,
         "score": score,
         "match": match,
         "type": query_type_meta
     }
     out.append(resource)
 #Sort this list by score
 sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
 #Refine only will handle top three matches.
 return sorted_out[:3]
示例#20
0
def norm(text):
    """Decode *text* from UTF-8 bytes and return its normalized form."""
    return normalize(text.decode('utf-8'))
示例#21
0
def search(raw_query, query_type='/geonames/all'):
    """
    Hit the GeoNames API for names.

    Returns at most three candidate dicts (id, name, score, match, type)
    sorted by descending fuzzy score; the candidate name embeds the
    coordinates ("name | lat, lng").
    """
    out = []
    unique_geonames_ids = []
    # Convert LoC-style headings into GeoNames-friendly queries first.
    mid_query = lc_parse.lc2geonames(raw_query, PY3)
    query = text.normalize(mid_query, PY3).strip()
    # Map the OpenRefine query type onto a GeoNames query index.
    query_type_meta = [i for i in refine_to_geonames if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = api_base_url + query_index  + '=' + urllib.parse.quote(query)
        else:
            url = api_base_url + query_index  + '=' + urllib.quote(query)
        app.logger.debug("GeoNames API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # NOTE(review): getopt.GetoptError is not raised by requests/json —
    # confirm the intended exception type.
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['geonames']):
        match = False
        name = item.get('name')
        alternate = item.get('toponymName')
        # NOTE(review): if 'toponymName' is a string, alternate[0] is its
        # first character, not an alternate name — confirm intended.
        if (len(alternate) > 0):
            alt = alternate[0]
        else:
            alt = ''
        geonames_id = item.get('geonameId')
        geonames_uri = make_uri(geonames_id)
        lat = item.get('lat')
        lng = item.get('lng')
        #Way to cheat + get name + coordinates into results:
        name_coords = name + ' | ' + lat + ', ' + lng
        #Avoid returning duplicates:
        if geonames_id in unique_geonames_ids:
            continue
        else:
            unique_geonames_ids.append(geonames_id)
        # Score against both name forms; keep the best.
        score_1 = fuzz.token_sort_ratio(query, name)
        score_2 = fuzz.token_sort_ratio(query, alt)
        score = max(score_1, score_2)
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": geonames_uri,
            "name": name_coords,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#22
0
def _lc_resource(query, name, uri, query_type_meta):
    """Score one LC heading against the query and build a Refine result dict.

    A fuzzy token-sort score above 95 is treated as an exact match.
    """
    score = fuzz.token_sort_ratio(query, name)
    match = score > 95
    app.logger.debug("Label is " + name + " Score is " + str(score) +
                     " URI is " + uri)
    return {
        "id": uri,
        "name": name,
        "score": score,
        "match": match,
        "type": query_type_meta
    }


def search(raw_query, query_type='/lc'):
    """
    Query the LC Authorities suggest and didyoumean APIs for headings.

    Combines primary headings (suggest) with cross references (didyoumean),
    scores each against the normalized query, and returns the top three
    results sorted by descending score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    # Get the results for the primary suggest API (primary headings, no cross-refs)
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(
                query.encode('utf8'))
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(
                query.encode('utf8'))
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # BUG FIX: the original caught getopt.GetoptError, which requests and
    # resp.json() never raise, so every network/parse failure crashed the
    # service instead of returning an empty result set.
    except Exception as e:
        app.logger.warning(e)
        return out
    # suggest responses look like [query, [labels], [counts], [uris]]
    for n in range(0, len(results[1])):
        out.append(_lc_resource(query, results[1][n], results[3][n],
                                query_type_meta))
    # Get the results for the didyoumean API (cross-refs, no primary headings)
    try:
        if query_index != '/authorities':
            if PY3:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
            else:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            altresp = requests.get(url)
            altresults = ET.fromstring(altresp.content)
            altresults2 = None
        else:
            # The generic /authorities index has no didyoumean endpoint, so
            # query the names and subjects endpoints separately.
            if PY3:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
            else:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            app.logger.debug("LC Authorities API url is " + url2)
            altresp = requests.get(url)
            altresp2 = requests.get(url2)
            altresults = ET.fromstring(altresp.content)
            altresults2 = ET.fromstring(altresp2.content)
    # BUG FIX: same wrong exception class as above (getopt.GetoptError).
    except Exception as e:
        app.logger.warning(e)
        return out
    for child in altresults.iter('{http://id.loc.gov/ns/id_service#}term'):
        out.append(_lc_resource(query, child.text, child.get('uri'),
                                query_type_meta))
    if altresults2 is not None:
        for child in altresults2.iter(
                '{http://id.loc.gov/ns/id_service#}term'):
            out.append(_lc_resource(query, child.text, child.get('uri'),
                                    query_type_meta))
    # Sort this list containing preflabels and crossrefs by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top three matches.
    return sorted_out[:3]
示例#23
0
def search(raw_query, query_type='/isni/name'):
    """
    Hit the ISNI API for names.

    Parses personal and organisation names out of the SRW XML response,
    scores each candidate against the normalized query, and returns the
    top three results sorted by descending score.
    """
    out = []
    unique_isni_ids = []
    # Commas in LC-style headings hurt ISNI matching, so drop them.
    query = text.normalize(raw_query, PY3).strip().replace(',', '')
    query_type_meta = [i for i in refine_to_isni if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #ISNI api requires spaces to be encoded as %20 rather than +
        if PY3:
            url = api_base_url + query_index + "+%3D+'" + urllib.parse.quote(
                query) + "'"
        else:
            url = api_base_url + query_index + "+%3D+'" + urllib.quote(
                query) + "'"
        app.logger.debug("ISNI API url is " + url)
        resp = requests.get(url)
        results = etree.fromstring(resp.content)
    except Exception as e:
        app.logger.warning(e)
        return out
    for record in results.iter("{http://www.loc.gov/zing/srw/}record"):
        match = False
        names = []
        if record.xpath(".//personalName"):
            for pers in record.xpath(".//personalName"):
                # BUG FIX: the bare excepts hid every error; only
                # AttributeError (find() returned None for an absent
                # element) is expected here.
                try:
                    fname = pers.find("forename").text
                except AttributeError:
                    fname = ''
                lname = pers.find("surname").text
                try:
                    date = pers.find("dates").text
                except AttributeError:
                    date = ''
                name = str(fname) + " " + lname + ' ' + str(date)
                # BUG FIX: strip('') was a no-op; strip() removes the
                # padding whitespace left when fname/date are empty.
                names.append(name.strip())
        elif record.xpath(".//organisation"):
            for org in record.xpath(".//organisationName"):
                mainname = org.find("mainName").text
                try:
                    subname = org.find("subdivisionName").text
                except AttributeError:
                    subname = ''
                # BUG FIX: the original called name.strip('') and threw the
                # result away; trim the trailing space when subname is empty.
                names.append((mainname + ' ' + str(subname)).strip())
        # BUG FIX: a record with neither personal nor organisation names
        # previously raised UnboundLocalError on refine_name (or ValueError
        # on max() of an empty set).  Skip such records instead.
        if not names:
            continue
        refine_name = names[0]
        isni_uri = record.xpath(".//isniURI")[0].text
        #Avoid returning duplicates:
        if isni_uri in unique_isni_ids:
            continue
        else:
            unique_isni_ids.append(isni_uri)
        scores = set()
        app.logger.debug(names)
        for name in names:
            nscore = fuzz.token_sort_ratio(query, name)
            scores.add(nscore)
        # Best fuzzy score over all name variants for this record.
        score = max(scores)
        for name in names:
            if query == text.normalize(name, PY3):
                match = True
        resource = {
            "id": isni_uri,
            "name": refine_name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
示例#24
0
def search(raw_query, query_type='/geonames/all'):
    """
    Hit the GeoNames API for names.

    Rewrites an LC-style heading into a GeoNames-friendly query, scores
    each returned place against it (name and toponymName variants), and
    returns the top three results sorted by descending score.
    """
    out = []
    unique_geonames_ids = []
    mid_query = lc_parse.lc2geonames(raw_query, PY3)
    query = text.normalize(mid_query, PY3).strip()
    query_type_meta = [i for i in refine_to_geonames if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = api_base_url + query_index + '=' + urllib.parse.quote(query)
        else:
            url = api_base_url + query_index + '=' + urllib.quote(query)
        app.logger.debug("GeoNames API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # BUG FIX: the original caught getopt.GetoptError, which requests and
    # resp.json() never raise, so every network/parse failure crashed the
    # service instead of returning an empty result set.
    except Exception as e:
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['geonames']):
        match = False
        name = item.get('name')
        # BUG FIX: toponymName is a plain string in the GeoNames JSON, not a
        # list.  The original took alternate[0] -- the first *character* --
        # and raised TypeError (len(None)) when the key was missing.
        alt = item.get('toponymName') or ''
        geonames_id = item.get('geonameId')
        geonames_uri = make_uri(geonames_id)
        lat = item.get('lat')
        lng = item.get('lng')
        #Way to cheat + get name + coordinates into results:
        name_coords = name + ' | ' + lat + ', ' + lng
        #Avoid returning duplicates:
        if geonames_id in unique_geonames_ids:
            continue
        else:
            unique_geonames_ids.append(geonames_id)
        score_1 = fuzz.token_sort_ratio(query, name)
        score_2 = fuzz.token_sort_ratio(query, alt)
        # Best fuzzy score over the primary and toponym name variants.
        score = max(score_1, score_2)
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": geonames_uri,
            "name": name_coords,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]