def __init__(self):
    """Attach a topic-link manager and an API helper to ``self``.

    The API helper is built from the ``desc_length`` and ``summ_length``
    attributes read from ``self``.
    """
    self.tl_creator = TopicLinkManager()
    length_settings = (self.desc_length, self.summ_length)
    self.api_utils = WikiAPIUtils(*length_settings)
class WikipediaHelper():
    """Fetch article extracts through ``WikiAPIUtils`` and attach linked topics.

    ``desc_length`` and ``summ_length`` are class-level defaults that are
    forwarded to ``WikiAPIUtils`` when an instance is created.
    """

    desc_length = DEFAULT_DESC_LENGTH
    summ_length = DEFAULT_SUMM_LENGTH

    def __init__(self):
        """Create the topic-link manager and API helper for this instance."""
        # Must stay a regular instance method: as a classmethod, ``self``
        # would be the class and these collaborators would be shared by
        # every instance.
        self.tl_creator = TopicLinkManager()
        self.api_utils = WikiAPIUtils(self.desc_length, self.summ_length)

    def _request_extracts(self, r_args):
        """Issue the "extract" API request for ``r_args``; return the response."""
        resources = self.api_utils.format_req(r_args)
        r_url = EXTRACT_URL.format(resources)
        return self.api_utils.make_api_request(r_url, EXCONT, "extract")

    def _run_thread_batch(self, first_id, target, *args):
        """Start ``MAX_NUM_THREADS`` ``FuncThread`` workers and join them all.

        Each worker is constructed as ``FuncThread(id, target, *args)`` with
        consecutive ids starting at ``first_id``.

        Returns:
            The id following the last one used, so a later batch can
            continue the numbering.
        """
        workers = []
        for offset in range(MAX_NUM_THREADS):
            worker = FuncThread(first_id + offset, target, *args)
            worker.start()
            workers.append(worker)
        # Join only after every worker has been started.
        for worker in workers:
            worker.join()
        return first_id + MAX_NUM_THREADS

    def get_article(self, r_args):
        """Fetch, save and link the topics for the requested article(s).

        Args:
            r_args: a single article reference or a sequence of them; a
                value that ``api_utils.is_seq`` rejects is wrapped in a
                list before the request is built.

        Returns:
            The topics produced by ``api_utils.parse_and_save_topic``, after
            ``add_linked_topics`` has been run on each of them.
        """
        if not self.api_utils.is_seq(r_args):
            r_args = [r_args]
        response = self._request_extracts(r_args)
        pages = response[PAGE_RESP_KEY]
        # The redirect section is optional in the response.
        redirects = (response[REDIRECT_RESP_KEY]
                     if REDIRECT_RESP_KEY in response else None)
        topics = self.api_utils.parse_and_save_topic(pages, redirects)
        for tp in topics:
            main_text = pages[str(tp.article_id)]['extract'].encode('utf-8')
            self.add_linked_topics(tp, main_text)
        return topics

    def update_article(self, topic):
        """Re-fetch the extract of ``topic`` and recompute its linked topics.

        Args:
            topic: a topic object exposing ``article_id``.

        Returns:
            The same ``topic`` object, after ``add_linked_topics`` has run.
        """
        tid = topic.article_id
        response = self._request_extracts(tid)
        pages = response[PAGE_RESP_KEY]
        main_text = pages[str(tid)]['extract'].encode('utf-8')
        self.add_linked_topics(topic, main_text)
        return topic

    def add_linked_topics(self, source_topic, main_text):
        """Rank the outlinks of ``source_topic`` and store the best topic links.

        The outlinks are requested from the API, stored on
        ``source_topic.outlinks`` and scored against ``main_text``.  Worker
        threads then run ``self.spun_topic_creator`` and
        ``self.spun_tlink_creator`` (expected to be defined elsewhere on the
        class; not visible in this block).  The resulting topic links are
        sorted by ``score`` in descending order, truncated, assigned to
        ``source_topic.linked_topics``, and the topic is saved.

        Args:
            source_topic: topic object exposing ``article_id``,
                ``description``, ``linked_topics``, ``outlinks`` and
                ``save()``.
            main_text: article text passed to ``score_outlinks``.

        Any exception raised during scoring or threading is reported on
        stdout together with a traceback and is not propagated; in that case
        ``linked_topics`` is left untouched and ``save()`` is not called.
        """
        tid = source_topic.article_id
        # Number of link slots still free.  Clamp at zero: a negative count
        # would be read as a from-the-end index by the slice below.
        # NOTE(review): the result replaces ``linked_topics`` instead of
        # extending it -- confirm that is intended.
        num_linked_topics = max(
            DEFAULT_NUM_LINKED_TOPICS - len(source_topic.linked_topics), 0)

        r_url = LINKED_TOPICS_URL.format(R_PAGEID.format(tid))
        response = self.api_utils.make_api_request(r_url, PLCONT, "links")
        pages = response[PAGE_RESP_KEY]
        linked_titles = self.api_utils.parse_linked_pages(tid, pages)
        source_topic.outlinks = linked_titles

        try:
            rel_topics = []
            topic_desc_dict = {tid: source_topic.description}

            scored_outlinks_dict = score_outlinks(main_text, linked_titles)
            sorted_outlinks = sorted(scored_outlinks_dict.items(),
                                     key=lambda entry: entry[1],
                                     reverse=True)
            # NOTE(review): this keeps NUM_TOP_LINKS + 1 entries -- confirm
            # the extra one is intended.
            top_links = sorted_outlinks[:NUM_TOP_LINKS + 1]
            most_relevant_links = [entry[0] for entry in top_links]

            # First batch: workers that create the topic objects.
            next_thread_id = self._run_thread_batch(
                1, self.spun_topic_creator,
                most_relevant_links, rel_topics, topic_desc_dict)

            # Score the related topics against the source topic.
            scored_desc_dict = score_topics(tid, topic_desc_dict)

            # Second batch: workers that create the topic links.
            topic_links = []
            self._run_thread_batch(
                next_thread_id, self.spun_tlink_creator,
                source_topic, scored_desc_dict, rel_topics, topic_links)

            sorted_tl = sorted(topic_links,
                               key=lambda link: link.score,
                               reverse=True)
            # Slicing never raises, so a short list needs no special case.
            source_topic.linked_topics = sorted_tl[:num_linked_topics]
            source_topic.save()
        except Exception:
            # Best effort: report the failure and carry on.
            print("Error: issue with threads")
            traceback.print_exc()
class WikipediaHelper():
    """Fetch article extracts through ``WikiAPIUtils`` and attach linked topics.

    ``desc_length`` and ``summ_length`` are class-level defaults that are
    forwarded to ``WikiAPIUtils`` when an instance is created.
    """

    desc_length = DEFAULT_DESC_LENGTH
    summ_length = DEFAULT_SUMM_LENGTH

    def __init__(self):
        """Create the topic-link manager and API helper for this instance."""
        # Must stay a regular instance method: as a classmethod, ``self``
        # would be the class and these collaborators would be shared by
        # every instance.
        self.tl_creator = TopicLinkManager()
        self.api_utils = WikiAPIUtils(self.desc_length, self.summ_length)

    def _request_extracts(self, r_args):
        """Issue the "extract" API request for ``r_args``; return the response."""
        resources = self.api_utils.format_req(r_args)
        r_url = EXTRACT_URL.format(resources)
        return self.api_utils.make_api_request(r_url, EXCONT, "extract")

    def _run_thread_batch(self, first_id, target, *args):
        """Start ``MAX_NUM_THREADS`` ``FuncThread`` workers and join them all.

        Each worker is constructed as ``FuncThread(id, target, *args)`` with
        consecutive ids starting at ``first_id``.

        Returns:
            The id following the last one used, so a later batch can
            continue the numbering.
        """
        workers = []
        for offset in range(MAX_NUM_THREADS):
            worker = FuncThread(first_id + offset, target, *args)
            worker.start()
            workers.append(worker)
        # Join only after every worker has been started.
        for worker in workers:
            worker.join()
        return first_id + MAX_NUM_THREADS

    def get_article(self, r_args):
        """Fetch, save and link the topics for the requested article(s).

        Args:
            r_args: a single article reference or a sequence of them; a
                value that ``api_utils.is_seq`` rejects is wrapped in a
                list before the request is built.

        Returns:
            The topics produced by ``api_utils.parse_and_save_topic``, after
            ``add_linked_topics`` has been run on each of them.
        """
        if not self.api_utils.is_seq(r_args):
            r_args = [r_args]
        response = self._request_extracts(r_args)
        pages = response[PAGE_RESP_KEY]
        # The redirect section is optional in the response.
        redirects = (response[REDIRECT_RESP_KEY]
                     if REDIRECT_RESP_KEY in response else None)
        topics = self.api_utils.parse_and_save_topic(pages, redirects)
        for tp in topics:
            main_text = pages[str(tp.article_id)]['extract'].encode('utf-8')
            self.add_linked_topics(tp, main_text)
        return topics

    def update_article(self, topic):
        """Re-fetch the extract of ``topic`` and recompute its linked topics.

        Args:
            topic: a topic object exposing ``article_id``.

        Returns:
            The same ``topic`` object, after ``add_linked_topics`` has run.
        """
        tid = topic.article_id
        response = self._request_extracts(tid)
        pages = response[PAGE_RESP_KEY]
        main_text = pages[str(tid)]['extract'].encode('utf-8')
        self.add_linked_topics(topic, main_text)
        return topic

    def add_linked_topics(self, source_topic, main_text):
        """Rank the outlinks of ``source_topic`` and store the best topic links.

        The outlinks are requested from the API, stored on
        ``source_topic.outlinks`` and scored against ``main_text``.  Worker
        threads then run ``self.spun_topic_creator`` and
        ``self.spun_tlink_creator`` (expected to be defined elsewhere on the
        class; not visible in this block).  The resulting topic links are
        sorted by ``score`` in descending order, truncated, assigned to
        ``source_topic.linked_topics``, and the topic is saved.

        Args:
            source_topic: topic object exposing ``article_id``,
                ``description``, ``linked_topics``, ``outlinks`` and
                ``save()``.
            main_text: article text passed to ``score_outlinks``.

        Any exception raised during scoring or threading is reported on
        stdout together with a traceback and is not propagated; in that case
        ``linked_topics`` is left untouched and ``save()`` is not called.
        """
        tid = source_topic.article_id
        # Number of link slots still free.  Clamp at zero: a negative count
        # would be read as a from-the-end index by the slice below.
        # NOTE(review): the result replaces ``linked_topics`` instead of
        # extending it -- confirm that is intended.
        num_linked_topics = max(
            DEFAULT_NUM_LINKED_TOPICS - len(source_topic.linked_topics), 0)

        r_url = LINKED_TOPICS_URL.format(R_PAGEID.format(tid))
        response = self.api_utils.make_api_request(r_url, PLCONT, "links")
        pages = response[PAGE_RESP_KEY]
        linked_titles = self.api_utils.parse_linked_pages(tid, pages)
        source_topic.outlinks = linked_titles

        try:
            rel_topics = []
            topic_desc_dict = {tid: source_topic.description}

            scored_outlinks_dict = score_outlinks(main_text, linked_titles)
            sorted_outlinks = sorted(scored_outlinks_dict.items(),
                                     key=lambda entry: entry[1],
                                     reverse=True)
            # NOTE(review): this keeps NUM_TOP_LINKS + 1 entries -- confirm
            # the extra one is intended.
            top_links = sorted_outlinks[:NUM_TOP_LINKS + 1]
            most_relevant_links = [entry[0] for entry in top_links]

            # First batch: workers that create the topic objects.
            next_thread_id = self._run_thread_batch(
                1, self.spun_topic_creator,
                most_relevant_links, rel_topics, topic_desc_dict)

            # Score the related topics against the source topic.
            scored_desc_dict = score_topics(tid, topic_desc_dict)

            # Second batch: workers that create the topic links.
            topic_links = []
            self._run_thread_batch(
                next_thread_id, self.spun_tlink_creator,
                source_topic, scored_desc_dict, rel_topics, topic_links)

            sorted_tl = sorted(topic_links,
                               key=lambda link: link.score,
                               reverse=True)
            # Slicing never raises, so a short list needs no special case.
            source_topic.linked_topics = sorted_tl[:num_linked_topics]
            source_topic.save()
        except Exception:
            # Best effort: report the failure and carry on.
            print("Error: issue with threads")
            traceback.print_exc()