def get_id_of_corresponding_algorithm(linktitle, page_title, fuzzy=False):
    """Look up the id of the indexed algorithm that matches ``linktitle``.

    The title is turned into an id with ``convert_to_id`` and fetched from
    the ``algorithm`` doc type of ``INDEX_NAME``.  When that exact lookup
    misses and ``fuzzy`` is true, a fuzzy ``match`` query for ``linktitle``
    is run against the ``name`` field and the id of the top hit is used.

    Returns the id, or ``None`` when nothing matches; in the latter case
    ``page_title`` is added to the
    ``rosetta-mapping-error-correspage-notfound`` set through ``rd``.
    """
    algo_id = convert_to_id(linktitle)
    # ignore=404: a missing document is expected to come back as a response
    # body whose 'found' flag is false rather than as an exception
    result = es.get(index=INDEX_NAME, doc_type='algorithm',
        id=algo_id, ignore=404)
    if result['found']:
        return algo_id

    if fuzzy:
        body = {
            "query": {
                "match": {
                    "name": {
                        # search for the title that was actually requested
                        "query": linktitle,
                        "fuzziness": "auto"
                    }
                }
            }
        }
        r = es.search(index=INDEX_NAME, doc_type='algorithm',
            body=body, size=1)
        # matching documents are nested under hits.hits in the search
        # response; an empty list means there was no match
        hits = r['hits']['hits']
        if hits:
            return hits[0]['_id']

    rd.sadd('rosetta-mapping-error-correspage-notfound',
        page_title)
    return None
示例#2
0
def get_id_of_corresponding_algorithm(linktitle, page_title, fuzzy=False):
    """Return the id of the indexed algorithm corresponding to ``linktitle``.

    An exact lookup by ``convert_to_id(linktitle)`` is tried first.  If it
    misses and ``fuzzy`` is true, a fuzzy ``match`` query for ``linktitle``
    on the ``name`` field is issued and the id of the best hit is returned.

    When no id can be determined, ``page_title`` is recorded in the
    ``rosetta-mapping-error-correspage-notfound`` set via ``rd`` and
    ``None`` is returned.
    """
    algo_id = convert_to_id(linktitle)
    # with ignore=404 a missing document is expected to be reported through
    # the 'found' flag of the response instead of raising
    result = es.get(index=INDEX_NAME, doc_type='algorithm', id=algo_id,
                    ignore=404)
    if result['found']:
        return algo_id

    if fuzzy:
        body = {
            "query": {
                "match": {
                    "name": {
                        # query with the requested title, not a fixed string
                        "query": linktitle,
                        "fuzziness": "auto"
                    }
                }
            }
        }
        r = es.search(index=INDEX_NAME,
                      doc_type='algorithm',
                      body=body,
                      size=1)
        # the list of matching documents lives under hits.hits
        hits = r['hits']['hits']
        if hits:
            return hits[0]['_id']

    rd.sadd('rosetta-mapping-error-correspage-notfound', page_title)
    return None
示例#3
0
def nel_title_elasticsearch(page_title):
    """Map ``page_title`` to an indexed algorithm by fuzzy title search.

    Returns a one-element list holding the algorithm id, or ``None`` when
    no algorithm is found.  On success the mapping is stored in the
    ``rosetta-mapping-success`` hash and ``page_title`` is added to the
    ``rosetta-mapping-success-wikipedia-autosuggest`` set through ``rd``.
    """
    # TODO search on name and alt_name
    # NOTE(review): assumes the fuzzy-capable variant of
    # get_id_of_corresponding_algorithm (the one accepting ``fuzzy``) is the
    # intended lookup here -- confirm
    algo_id = get_id_of_corresponding_algorithm(page_title, page_title,
                                                fuzzy=True)
    if algo_id is None:
        return None

    rd.hset('rosetta-mapping-success', page_title, json.dumps([algo_id]))
    rd.sadd('rosetta-mapping-success-wikipedia-autosuggest', page_title)
    safe_print(algo_id)
    print('--second')
    return [algo_id]
def nel_title_elasticsearch(page_title):
    """Resolve ``page_title`` to an indexed algorithm via fuzzy title search.

    Returns ``[algorithm_id]`` on success and ``None`` otherwise.  A
    successful mapping is written to the ``rosetta-mapping-success`` hash
    and ``page_title`` is added to the
    ``rosetta-mapping-success-wikipedia-autosuggest`` set through ``rd``.
    """
    # TODO search on name and alt_name
    # NOTE(review): assumes the variant of
    # get_id_of_corresponding_algorithm that accepts ``fuzzy`` is the
    # intended lookup -- confirm
    algo_id = get_id_of_corresponding_algorithm(page_title, page_title,
        fuzzy=True)
    if algo_id is None:
        return None

    rd.hset('rosetta-mapping-success', page_title, json.dumps([algo_id]))
    rd.sadd('rosetta-mapping-success-wikipedia-autosuggest',
        page_title)
    safe_print(algo_id)
    print('--second')
    return [algo_id]
示例#5
0
def index_corresponding_algorithm(wikipage, linktitle, page_title):
    """Index ``wikipage`` as an algorithm entry and return its id.

    When ``wikipage`` is ``None`` nothing is indexed: the pair
    ``page_title -> linktitle`` is added to the
    ``rosetta-mapping-error-indexing-error`` set through ``rd`` and
    ``None`` is returned.
    """
    if wikipage is None:
        # nothing to index; remember which mapping failed
        rd.sadd('rosetta-mapping-error-indexing-error',
                page_title + ' -> ' + linktitle)
        return None

    return index_wiki_algorithm_entry(wikipage, linktitle, visitedwiki)
def index_corresponding_algorithm(wikipage, linktitle, page_title):
    """Try to index the algorithm described by ``wikipage``.

    Returns whatever ``index_wiki_algorithm_entry`` returns for the page.
    If ``wikipage`` is ``None``, the failed mapping
    ``page_title -> linktitle`` is recorded in the
    ``rosetta-mapping-error-indexing-error`` set and ``None`` is returned.
    """
    have_page = wikipage is not None
    if have_page:
        return index_wiki_algorithm_entry(wikipage, linktitle,
            visitedwiki)

    # no page was supplied, so only the failure can be recorded
    rd.sadd('rosetta-mapping-error-indexing-error',
        page_title + ' -> ' + linktitle)
    return None
def get_id_of_corresponding_algorithm(linktitle, page_title):
    """Return the id of the indexed algorithm for ``linktitle``, if any.

    The id is derived with ``convert_to_id`` and looked up in the
    ``algorithm`` doc type of ``INDEX_NAME``.  When the document is not
    found, ``page_title`` is added to the
    ``rosetta-mapping-error-correspage-notfound`` set and ``None`` is
    returned.
    """
    doc_id = convert_to_id(linktitle)
    lookup = es.get(index=INDEX_NAME, doc_type='algorithm',
        id=doc_id, ignore=404)
    if not lookup['found']:
        # remember the task page whose counterpart is not indexed
        rd.sadd('rosetta-mapping-error-correspage-notfound',
            page_title)
        return None

    return doc_id
def get_sorted_similar_links(taskname, links):
    """Return the links similar to ``taskname``, most similar first.

    ``taskname`` and every link are encoded to UTF-8 and handed to
    ``process.extract``, which yields ``(link, confidence)`` pairs.  Only
    links whose confidence is above ``FUZZY_THRESHOLD`` are kept, ordered
    by descending confidence.

    If the extraction raises, the error text plus the task name is added
    to the ``rosetta-mapping-taskname-coding-error`` set and an empty list
    is returned.
    """
    taskname = taskname.encode('utf8')
    choices = [link.encode('utf8') for link in links]
    try:
        res = process.extract(taskname, choices)
    except Exception as e:
        # best effort: record the failure and carry on without candidates
        rd.sadd('rosetta-mapping-taskname-coding-error', str(e) + taskname)
        return []
    if not res:
        return []

    # rank while the confidence is still attached to each link; sorting the
    # bare link strings afterwards would compare characters, not scores
    ranked = sorted(res, key=lambda pair: pair[1], reverse=True)
    return [link for (link, confidence) in ranked
        if confidence > FUZZY_THRESHOLD]
def get_sorted_similar_links(taskname, links):
    """Return links fuzzily matching ``taskname``, best match first.

    Both the task name and the links are UTF-8 encoded before being passed
    to ``process.extract``, whose ``(link, confidence)`` results are
    ordered by descending confidence and filtered to those strictly above
    ``FUZZY_THRESHOLD``.

    Any exception raised by the extraction is recorded (error text plus
    task name) in the ``rosetta-mapping-taskname-coding-error`` set, and
    an empty list is returned.
    """
    taskname = taskname.encode('utf8')
    choices = [link.encode('utf8') for link in links]
    try:
        res = process.extract(taskname, choices)
    except Exception as e:
        # best effort: note the failure and return no candidates
        rd.sadd('rosetta-mapping-taskname-coding-error', str(e) + taskname)
        return []
    if not res:
        return []

    # order by the confidence score before it is dropped; once only the
    # link strings remain there is no score left to sort on
    by_confidence = sorted(res, key=lambda pair: pair[1], reverse=True)
    return [link for (link, confidence) in by_confidence
        if confidence > FUZZY_THRESHOLD]
示例#10
0
def nel_title_suggest(page_title, auto_suggest=True):
    """Map ``page_title`` to an algorithm via its wiki page.

    The page is fetched with ``get_wiki_page``; the matching algorithm is
    looked up and, if it is not indexed yet, indexed.  On success the
    mapping is stored in the ``rosetta-mapping-success`` hash,
    ``page_title`` is added to the
    ``rosetta-mapping-success-wikipedia-autosuggest`` set, and a
    one-element list with the id is returned.  Otherwise returns ``None``.
    """
    wikipage = get_wiki_page(page_title, auto_suggest)
    if wikipage is None:
        return None

    # check if indexed, otherwise try to index this algorithm
    algo_id = get_id_of_corresponding_algorithm(page_title, page_title)
    if algo_id is None:
        algo_id = index_corresponding_algorithm(wikipage, page_title,
                                                page_title)
    if algo_id is None:
        return None

    rd.hset('rosetta-mapping-success', page_title, json.dumps([algo_id]))
    rd.sadd('rosetta-mapping-success-wikipedia-autosuggest',
            page_title)
    safe_print(algo_id)
    print('--second')
    return [algo_id]
def nel_title_suggest(page_title, auto_suggest=True):
    """Resolve ``page_title`` to an algorithm id through its wiki page.

    Returns ``[algorithm_id]`` when the page exists and its algorithm is
    indexed (or could be indexed now), recording the mapping in the
    ``rosetta-mapping-success`` hash and the
    ``rosetta-mapping-success-wikipedia-autosuggest`` set.  Returns
    ``None`` in every other case.
    """
    wikipage = get_wiki_page(page_title, auto_suggest)
    if wikipage is None:
        return None

    # use the indexed entry when there is one ...
    found = get_id_of_corresponding_algorithm(page_title, page_title)
    if found is None:
        # ... otherwise try to index this algorithm
        found = index_corresponding_algorithm(wikipage, page_title,
            page_title)
    if found is None:
        return None

    rd.hset('rosetta-mapping-success', page_title,
        json.dumps([found]))
    rd.sadd('rosetta-mapping-success-wikipedia-autosuggest',
        page_title)
    safe_print(found)
    print('--second')
    return [found]
示例#12
0
def nel_wikilinks_match_all(wikilinks, page_title):
    """Collect the ids of all algorithm pages among ``wikilinks``.

    Each link whose wiki page exists and passes ``is_algorithm_page`` is
    resolved to an indexed algorithm id, indexing the page first when
    necessary; links that cannot be resolved are skipped.  A non-empty
    result is stored in the ``rosetta-mapping-success`` hash and
    ``page_title`` is added to the
    ``rosetta-mapping-success-all-algo-links`` set.

    Returns the (possibly empty) list of ids.
    """
    matched = []
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is None or not is_algorithm_page(wikipage):
            continue

        # check if indexed, otherwise try to index this algorithm
        algo_id = get_id_of_corresponding_algorithm(link, page_title)
        if algo_id is None:
            algo_id = index_corresponding_algorithm(wikipage, link, page_title)
        if algo_id is not None:
            matched.append(algo_id)

    if matched:
        rd.hset('rosetta-mapping-success', page_title, json.dumps(matched))
        rd.sadd('rosetta-mapping-success-all-algo-links', page_title)
        safe_print(matched)
        print('--all-link')

    return matched
示例#13
0
def get_corres_wikipedia_algo_id(page):
    """Find the algorithm id(s) corresponding to a task ``page``.

    Several strategies are tried in order and the result of the first one
    that succeeds is returned unchanged.  Each strategy already returns a
    list of ids, so the result is a flat list.  When every strategy fails,
    the page title is added to the
    ``rosetta-mapping-error-undefinable-wikilinks`` set and ``None`` is
    returned.
    """
    wikilinks = [
        linktitle for (linksite, linktitle) in page.iwlinks()
        if linksite == 'wp'
    ]

    # first, try wikilinks that has titles similar to the task name,
    # these links are sorted by confidence of fuzzy matching
    ids = nel_wikilinks_fuzzy(wikilinks, page.page_title)
    if ids is not None:
        return ids

    # then use wikipedia api's auto-suggest to find corresponding
    # wikipedia page
    ids = nel_title_suggest(page.page_title, False)
    if ids is not None:
        return ids

    # NOTE: the elasticsearch fuzzy title match (nel_title_elasticsearch)
    # is currently disabled as a strategy here.

    # then, use crosswikis dictionary to get the most possible wiki link
    ids = nel_title_crosswikis(page.page_title)
    if ids is not None:
        return ids

    # NOTE: the final fallback of relating the task to ALL algorithm pages
    # mentioned in its description (nel_wikilinks_match_all) is currently
    # disabled here.

    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    print('')
    return None
def nel_wikilinks_match_all(wikilinks, page_title):
    """Return the ids of every algorithm page referenced by ``wikilinks``.

    For each link the wiki page is fetched; pages that are missing or do
    not pass ``is_algorithm_page`` are ignored.  The remaining pages are
    mapped to an indexed algorithm id, indexing them on demand, and links
    that still yield no id are dropped.  If any id was found, the list is
    saved in the ``rosetta-mapping-success`` hash and ``page_title`` is
    added to the ``rosetta-mapping-success-all-algo-links`` set.
    """
    collected = []
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is None or not is_algorithm_page(wikipage):
            continue

        # prefer the already indexed entry
        current = get_id_of_corresponding_algorithm(link, page_title)
        if current is None:
            # try to index this algorithm
            current = index_corresponding_algorithm(wikipage, link,
                page_title)
        if current is None:
            continue
        collected.append(current)

    if collected:
        rd.hset('rosetta-mapping-success', page_title,
            json.dumps(collected))
        rd.sadd('rosetta-mapping-success-all-algo-links', page_title)
        safe_print(collected)
        print('--all-link')

    return collected
def get_corres_wikipedia_algo_id(page):
    """Return the list of algorithm ids matching a task ``page``.

    The strategies below are tried in order; each returns either ``None``
    or a list of ids, and the first list obtained is returned as is (it
    is not wrapped in another list).  If no strategy succeeds, the page
    title is added to the ``rosetta-mapping-error-undefinable-wikilinks``
    set and ``None`` is returned.
    """
    wikilinks = [linktitle
        for (linksite, linktitle) in page.iwlinks()
        if linksite == 'wp']

    # first, try wikilinks that has titles similar to the task name,
    # these links are sorted by confidence of fuzzy matching
    ids = nel_wikilinks_fuzzy(wikilinks, page.page_title)
    if ids is not None:
        return ids

    # then use wikipedia api's auto-suggest to find corresponding
    # wikipedia page
    ids = nel_title_suggest(page.page_title, False)
    if ids is not None:
        return ids

    # NOTE: matching the title with elasticsearch
    # (nel_title_elasticsearch) is currently disabled as a strategy here.

    # then, use crosswikis dictionary to get the most possible wiki link
    ids = nel_title_crosswikis(page.page_title)
    if ids is not None:
        return ids

    # NOTE: the last-resort strategy of linking the task to ALL algorithm
    # pages mentioned in its description (nel_wikilinks_match_all) is
    # currently disabled here.

    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    print('')
    return None
示例#16
0
def nel_title_crosswikis(page_title):
    """Map ``page_title`` to an algorithm using the crosswikis table.

    All ``(cprob, entity)`` rows whose anchor equals ``page_title`` are
    fetched and the entity with the highest ``cprob`` is taken as the
    candidate wiki link.  The candidate is resolved to an indexed
    algorithm id, indexing the page first if needed.

    Returns ``[algorithm_id]`` on success (after storing the mapping in
    the ``rosetta-mapping-success`` hash and adding ``page_title`` to the
    ``rosetta-mapping-success-crosswikis`` set), otherwise ``None``.
    """
    query = "SELECT cprob, entity FROM queries WHERE anchor = %s"
    suggested_wikilinks = list(session.execute(query, [page_title]))
    if not suggested_wikilinks:
        return None

    # get the most confident link: assumes cprob is a probability-like
    # score where larger means more confident, so the maximum is wanted
    # (an ascending sort followed by [0] would pick the least confident)
    toplink = max(suggested_wikilinks, key=lambda tup: tup[0])[1]
    wikipage = get_wiki_page(toplink.replace('_', ' '))
    if wikipage is None:
        return None

    # check if indexed, otherwise try to index this algorithm
    algo_id = get_id_of_corresponding_algorithm(toplink, page_title)
    if algo_id is None:
        algo_id = index_corresponding_algorithm(wikipage, toplink,
                                                page_title)
    if algo_id is None:
        return None

    rd.hset('rosetta-mapping-success', page_title,
            json.dumps([algo_id]))
    rd.sadd('rosetta-mapping-success-crosswikis', page_title)
    safe_print(algo_id)
    print('--third')
    return [algo_id]
def nel_title_crosswikis(page_title):
    """Resolve ``page_title`` through the crosswikis anchor table.

    The ``(cprob, entity)`` rows stored for the anchor ``page_title`` are
    read, and the entity carrying the largest ``cprob`` is used as the
    wiki link to resolve.  That link is mapped to an indexed algorithm id,
    indexing its page on demand.

    Returns a one-element list with the id when the mapping succeeds; the
    mapping is then also written to the ``rosetta-mapping-success`` hash
    and ``page_title`` added to the ``rosetta-mapping-success-crosswikis``
    set.  Returns ``None`` otherwise.
    """
    query = "SELECT cprob, entity FROM queries WHERE anchor = %s"
    suggested_wikilinks = list(session.execute(query, [page_title]))
    if not suggested_wikilinks:
        return None

    # get the most confident link: assumes a larger cprob means a more
    # confident suggestion, so take the maximum rather than the first
    # element of an ascending sort
    best_row = max(suggested_wikilinks, key=lambda tup: tup[0])
    toplink = best_row[1]
    wikipage = get_wiki_page(toplink.replace('_', ' '))
    if wikipage is None:
        return None

    # check if indexed
    algo_id = get_id_of_corresponding_algorithm(toplink, page_title)
    if algo_id is None:
        # try to index this algorithm
        algo_id = index_corresponding_algorithm(wikipage, toplink,
            page_title)
    if algo_id is None:
        return None

    rd.hset('rosetta-mapping-success', page_title,
        json.dumps([algo_id]))
    rd.sadd('rosetta-mapping-success-crosswikis',
        page_title)
    safe_print(algo_id)
    print('--third')
    return [algo_id]
示例#18
0
def nel_wikilinks_fuzzy(wikilinks, page_title):
    """Map ``page_title`` to an algorithm via similarly titled wiki links.

    With no links at all, ``page_title`` is added to the
    ``rosetta-mapping-error-no-wiki-links`` set.  Otherwise the links
    returned by ``get_sorted_similar_links`` are tried in order; the first
    one that is indexed, or can be indexed now, wins.  Its mapping is
    stored in the ``rosetta-mapping-success`` hash, ``page_title`` is
    added to the ``rosetta-mapping-similars-success`` set and
    ``[algorithm_id]`` is returned.

    Returns ``None`` when no link could be resolved.
    """
    if len(wikilinks) == 0:
        # no any wiki links
        rd.sadd('rosetta-mapping-error-no-wiki-links', page_title)
        return None

    # candidates come ordered by get_sorted_similar_links
    for link in get_sorted_similar_links(page_title, wikilinks):
        # check if indexed
        algo_id = get_id_of_corresponding_algorithm(link, page_title)
        if algo_id is None:
            # try to index this algorithm
            wikipage = get_wiki_page(link)
            if wikipage is not None:
                algo_id = index_corresponding_algorithm(wikipage, link,
                                                        page_title)
            safe_print(algo_id)
        if algo_id is None:
            continue

        rd.hset('rosetta-mapping-success', page_title,
                json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page_title)
        safe_print(algo_id)
        print('--first')
        return [algo_id]

    return None
def nel_wikilinks_fuzzy(wikilinks, page_title):
    """Resolve ``page_title`` using wiki links with a similar title.

    An empty ``wikilinks`` is recorded in the
    ``rosetta-mapping-error-no-wiki-links`` set.  Otherwise each link from
    ``get_sorted_similar_links`` is checked in turn: the indexed id is
    used if present, else the link's wiki page is fetched and indexed.
    The first link yielding an id is stored in the
    ``rosetta-mapping-success`` hash, ``page_title`` is added to the
    ``rosetta-mapping-similars-success`` set and a one-element list with
    the id is returned.  ``None`` is returned if nothing matched.
    """
    if len(wikilinks) == 0:
        # no any wiki links
        rd.sadd('rosetta-mapping-error-no-wiki-links', page_title)
        return None

    for candidate in get_sorted_similar_links(page_title, wikilinks):
        # check if indexed
        resolved = get_id_of_corresponding_algorithm(candidate, page_title)
        if resolved is None:
            # try to index this algorithm
            wikipage = get_wiki_page(candidate)
            if wikipage is not None:
                resolved = index_corresponding_algorithm(wikipage,
                    candidate, page_title)
            safe_print(resolved)
        if resolved is None:
            continue

        rd.hset('rosetta-mapping-success', page_title,
            json.dumps([resolved]))
        rd.sadd('rosetta-mapping-similars-success', page_title)
        safe_print(resolved)
        print('--first')
        return [resolved]

    return None
示例#20
0
def get_corres_wikipedia_algo_id(page):
    """Return the algorithm ids corresponding to a task ``page``.

    Only the page's interwiki links whose site is ``'wp'`` are considered.

    1. Links similar to the task name (as ordered by
       ``get_sorted_similar_links``) are tried first; the first one that is
       indexed or can be indexed yields a one-element list.
    2. Otherwise every link whose wiki page passes ``is_algorithm_page``
       is resolved, and all resulting ids are returned.

    Successful mappings are stored in the ``rosetta-mapping-success``
    hash.  Returns ``None`` when the page has no such links or none of
    them can be resolved; the page title is then added to the matching
    ``rosetta-mapping-error-*`` set.
    """
    wikilinks = [linktitle
        for (linksite, linktitle) in page.iwlinks()
        if linksite == 'wp']

    if not wikilinks:
        # no any wiki links
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None

    # first, try wikilinks that has titles similar to the task name,
    # these links are sorted by confidence of fuzzy matching
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        # check if indexed
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # try to index this algorithm
            wikipage = get_wiki_page(link)
            algo_id = index_corresponding_algorithm(wikipage, link,
                page.page_title)
            if algo_id is None:
                continue

        rd.hset('rosetta-mapping-success', page.page_title,
            json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [algo_id]

    # then, if none of the links is similar to the task name,
    # 1, store the task description
    # 2, relate the implementation with ALL wiki algorithms pages
    #    mentioned in description
    ids = []
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is None or not is_algorithm_page(wikipage):
            continue

        # check if indexed
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # try to index this algorithm, reusing the page fetched above
            # instead of requesting it a second time
            algo_id = index_corresponding_algorithm(wikipage, link,
                page.page_title)
            if algo_id is None:
                continue
        ids.append(algo_id)

    if ids:
        rd.hset('rosetta-mapping-success', page.page_title,
            json.dumps(ids))
        return ids

    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None