def get_corres_wikipedia_algo_id(page):
    """Map a Rosetta Code task page to the id(s) of corresponding Wikipedia algorithms.

    Strategy:
      1. Try the interwiki links whose titles fuzzily match the task name
         (highest-confidence match first); on success return a one-element list.
      2. Otherwise, relate the task to ALL algorithm pages mentioned in its
         wikilinks and return the full id list.
    Records success/failure in redis sets/hashes as a side effect.
    Returns None when no mapping could be established.
    """
    wikilinks = [linktitle for (linksite, linktitle) in list(page.iwlinks())
                 if linksite == 'wp']
    if len(wikilinks) == 0:
        # no wiki links at all -- record the failure and bail out
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None
    # first, try wikilinks that have titles similar to the task name;
    # these links are sorted by confidence of fuzzy matching
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        # check if already indexed
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # not indexed yet -- try to index this algorithm now
            wikipage = get_wiki_page(link)
            algo_id = index_corresponding_algorithm(wikipage, link, page.page_title)
            if algo_id is None:
                continue
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [algo_id]
    # then, if none of the links is similar to the task name:
    # 1, store the task description
    # 2, relate the implementation with ALL wiki algorithm pages
    #    mentioned in the description
    ids = list()
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is not None and is_algorithm_page(wikipage):
            # check if already indexed
            algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
            if algo_id is None:
                # not indexed yet -- index using the page we already fetched
                # (FIX: original re-fetched the same page a second time here)
                algo_id = index_corresponding_algorithm(wikipage, link, page.page_title)
                if algo_id is None:
                    continue
            ids.append(algo_id)
    if len(ids) > 0:
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps(ids))
        return ids
    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None
def index_wiki_page(title, depth, visited):
    """Recursively index a wiki page (category or member) into the algorithm DB.

    Returns a tuple (algo_id, cate_id); either element is -1 when the page
    did not produce an entry of that kind.  Marks every examined title as
    visited so repeated crawls skip it (unless UPDATING_WIKI forces a revisit).
    NOTE(review): reconstructed from collapsed source -- the final
    mark_visited/return pair is assumed to be at function level, shared by
    both the category and member branches; confirm against the repository.
    """
    print 'looking at page %s, at depth %d:' % (title, depth)
    # -1 is the sentinel for "no entry created"
    algo_id = -1
    cate_id = -1
    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)
    if pw.is_category_title(title):
        # is a category page: recurse into members (bounded by depth)
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            for member in page.categorymembers:
                # depth+1: one level deeper into the category tree
                (child_algo_id, child_cate_id) = index_wiki_page(member, depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)
            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # not an algorithm category (no algorithmic children), ignore it
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page, child_algo_ids, child_cate_ids)
    else:
        # is a member (article) page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to the algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'
    mark_visited(title, visited)
    return (algo_id, cate_id)
def nel_wikilinks_match_all(wikilinks, page_title): ids = list() for link in wikilinks: wikipage = get_wiki_page(link) if wikipage is not None and is_algorithm_page(wikipage): # check if indexed id = get_id_of_corresponding_algorithm(link, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, link, page_title) if id is None: continue ids.append(id) if len(ids) > 0: rd.hset('rosetta-mapping-success', page_title, json.dumps(ids)) rd.sadd('rosetta-mapping-success-all-algo-links', page_title) safe_print(ids) print '--all-link' return ids