def get_corres_wikipedia_algo_id(page):
    """Map a Rosetta Code task page to the id(s) of corresponding Wikipedia algorithms.

    Strategy:
      1. Try the interwiki links whose titles fuzzily match the task name
         (highest-confidence match first); on success return a one-element list.
      2. Otherwise, relate the task to ALL algorithm pages mentioned in its
         wikilinks and return the full id list.
    Records success/failure in redis sets/hashes as a side effect.
    Returns None when no mapping could be established.
    """
    wikilinks = [linktitle for (linksite, linktitle) in list(page.iwlinks())
                 if linksite == 'wp']
    if len(wikilinks) == 0:
        # no wiki links at all -- record the failure and bail out
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None
    # first, try wikilinks that have titles similar to the task name;
    # these links are sorted by confidence of fuzzy matching
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        # check if already indexed
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # not indexed yet -- try to index this algorithm now
            wikipage = get_wiki_page(link)
            algo_id = index_corresponding_algorithm(wikipage, link, page.page_title)
            if algo_id is None:
                continue
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [algo_id]
    # then, if none of the links is similar to the task name:
    # 1, store the task description
    # 2, relate the implementation with ALL wiki algorithm pages
    #    mentioned in the description
    ids = list()
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is not None and is_algorithm_page(wikipage):
            # check if already indexed
            algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
            if algo_id is None:
                # not indexed yet -- index using the page we already fetched
                # (FIX: original re-fetched the same page a second time here)
                algo_id = index_corresponding_algorithm(wikipage, link, page.page_title)
                if algo_id is None:
                    continue
            ids.append(algo_id)
    if len(ids) > 0:
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps(ids))
        return ids
    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None
def index_wiki_page(title, depth, visited):
    """Recursively index a wiki page (category or member) into the algorithm DB.

    Returns a tuple (algo_id, cate_id); either element is -1 when the page
    did not produce an entry of that kind.  Marks every examined title as
    visited so repeated crawls skip it (unless UPDATING_WIKI forces a revisit).
    NOTE(review): reconstructed from collapsed source -- the final
    mark_visited/return pair is assumed to be at function level, shared by
    both the category and member branches; confirm against the repository.
    """
    print 'looking at page %s, at depth %d:' % (title, depth)
    # -1 is the sentinel for "no entry created"
    algo_id = -1
    cate_id = -1
    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)
    if pw.is_category_title(title):
        # is a category page: recurse into members (bounded by depth)
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            for member in page.categorymembers:
                # depth+1: one level deeper into the category tree
                (child_algo_id, child_cate_id) = index_wiki_page(member, depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)
            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # not an algorithm category (no algorithmic children), ignore it
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page, child_algo_ids, child_cate_ids)
    else:
        # is a member (article) page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to the algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'
    mark_visited(title, visited)
    return (algo_id, cate_id)
def nel_wikilinks_match_all(wikilinks, page_title): ids = list() for link in wikilinks: wikipage = get_wiki_page(link) if wikipage is not None and is_algorithm_page(wikipage): # check if indexed id = get_id_of_corresponding_algorithm(link, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, link, page_title) if id is None: continue ids.append(id) if len(ids) > 0: rd.hset('rosetta-mapping-success', page_title, json.dumps(ids)) rd.sadd('rosetta-mapping-success-all-algo-links', page_title) safe_print(ids) print '--all-link' return ids