def info(topic):
    response = {}
    response["type"] = "wiki"
    try:
        page = wikipedia.page(topic)
        response['title'] = page.title
        response['url'] = page.url
        response['content'] = wikipedia.summary(page.title, sentences=5)
        if len(response['content']) < 200:
            response['content'] = wikipedia.summary(page.title, sentences=10)
    except Exception as error:
        ename = type(error).__name__
        if ename == 'DisambiguationError':
            page = wikipedia.page(error.options[0])
            response['title'] = page.title
            response['url'] = page.url
            response['content'] = wikipedia.summary(page.title, sentences=2)
            if len(response['content']) < 200:
                response['content'] = wikipedia.summary(page.title, sentences=10)
        elif ename == 'HTTPTimeoutError':
            response['type'] = "error"
            response['error'] = "I couldn't reach wikipedia"
        elif ename == 'PageError':
            response['type'] = "error"
            response['error'] = "I couldn't find anything on wikipedia"
        else:
            response['type'] = "error"
            response['error'] = "Unknown error occurred while reaching wikipedia"
    return response
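A minimal usage sketch for the info() helper above, assuming `import wikipedia` is already in scope and the network is reachable; the topic string is only an example.

result = info("Python (programming language)")
if result["type"] == "wiki":
    # successful lookup: title, url and a short summary are populated
    print(result["title"], result["url"])
    print(result["content"][:200])
else:
    # error path: the helper reports a human-readable message instead
    print(result["error"])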
def searchWiki(page):
    wikipedia.set_lang("fr")
    link = ''
    try:
        # p = wikipedia.page(page)
        # link = p.url
        propos = wikipedia.search(page, results=5, suggestion=False)
        for choice in propos:
            if choice.encode('utf-8') == page.encode('utf-8'):
                p = wikipedia.page(page)
                link = p.url
                break
            elif page in choice:
                # TODO
                print('There is a proposition containing the keyWord ')
                print(choice)
            else:
                try:
                    wikipedia.page(page, redirect=False, auto_suggest=False)
                except wikipedia.exceptions.RedirectError:
                    p = wikipedia.page(page)
                    link = p.url
                    break
                except:
                    link = ''
    except:
        link = ""
    return link  # .encode('utf-8')
def findRelevantArticles(term, data_path='.'):
    articleList = []
    articles = wikipedia.search(term)  # Setting suggestion = False (default value); no clear use for it now
    for article in articles:
        try:
            article = wikipedia.page(article)
            category_keywords = set(itertools.chain.from_iterable(
                [category.lower().split() for category in article.categories]))
            if len(category_keywords & relevant_categories) > 0:
                articlefilename = "content_" + str(article.title.lower()) + ".txt"
                if os.path.isfile(articlefilename):
                    articlefilename = "content_" + str(article.title.lower()) + '%s.txt' % str(term + time.strftime("%Y%m%d-%H%M%S"))
                with codecs.open(os.path.join(data_path, articlefilename), 'wb', 'utf-8') as outfile:
                    content = wikipedia.page(article).content
                    outfile.write(content)
                articleList.append(str(article.title))
        except wikipedia.exceptions.PageError as e:
            pass
        except wikipedia.exceptions.DisambiguationError as e:
            for article in e.options:
                try:
                    article = wikipedia.page(article)
                    category_keywords = set(itertools.chain.from_iterable(
                        [category.lower().split() for category in article.categories]))
                    if len(category_keywords & relevant_categories) > 0:
                        articlefilename = "content_" + str(article.title.lower()) + ".txt"
                        if os.path.isfile(articlefilename):
                            articlefilename = "content_" + str(article.title.lower()) + '%s.txt' % str(term + time.strftime("%Y%m%d-%H%M%S"))
                        with codecs.open(os.path.join(data_path, articlefilename), 'wb', 'utf-8') as outfile:
                            outfile.write(article.content)
                        articleList.append(str(article.title))
                except wikipedia.exceptions.DisambiguationError as f:
                    pass
    return articleList
def wikify():
    """Returns the sentences with wikipedia links"""
    tag_dict = look_entity()
    link_dict = {}
    combined = combine()
    for item in tag_dict.keys():
        if item in combined.keys():
            try:
                if combined[item] in wikipedia.page(combined[item]).content:
                    link_dict[item] = wikipedia.page(combined[item]).url
            except wikipedia.exceptions.DisambiguationError as disamb:
                try:
                    link_dict[item] = wikipedia.page(disamb.options[0]).url
                except:
                    pass
            except wikipedia.exceptions.PageError:
                pass
        else:
            try:
                link_dict[item] = wikipedia.page(item).url
            except wikipedia.exceptions.DisambiguationError as disamb:
                try:
                    link_dict[item] = wikipedia.page(disamb.options[0]).url
                except:
                    pass
            except wikipedia.exceptions.PageError:
                pass
    return link_dict
def article(self, pageid=None, title=None):
    """
    Returns a specific article from Wikipedia, given its pageid or its title.
    Downloads it if necessary.
    """
    if pageid is None and title is None:
        raise Exception('Pageid and title can\'t be None at the same time')
    if pageid is None:
        d = self.db.articles.find_one({'title': title})
        if d is not None:
            return d  # found it
    else:
        d = self.db.articles.find_one({'_id': pageid})
        if d is not None:
            return d  # found it
    try:
        if pageid is not None:
            page = wikipedia.page(pageid=pageid)
        else:
            page = wikipedia.page(title=title)
    except (
        wikipedia.exceptions.DisambiguationError,
        wikipedia.exceptions.PageError,
        wikipedia.exceptions.WikipediaException,
        requests.exceptions.RequestException,
        ValueError  # error decoding JSON response
    ):
        return
    time.sleep(0.5)  # throttle requests; time.wait() does not exist
    # Even if we didn't find pageid or title, it still could be in the DB
    # since the title could have changed
    try:
        d = {
            '_id': int(page.pageid),
            'title': page.title,
            'content': page.content
        }
    except KeyboardInterrupt:  # filter KeyboardInterrupt from here
        raise
    except Exception:
        return  # can't add this entry
    self.db.articles.update_one(
        {'_id': d['_id']},
        {'$set': d},
        upsert=True
    )
    return d
def wiki(event, bot):
    """ wiki \x02searchterm\x02. Will search Wikipedia for \x02searchterm\x02. """
    if not event.argument:
        return bot.say(functionHelp(wiki))
    result = search(event.argument, results=1, suggestion=True)
    if not result[0]:
        if result[1]:
            return bot.say("No results found. Did you mean \x02%s\x02?" % result[1])
        else:
            return bot.say("No results found.")
    errors = []
    attempt = 0
    p = None
    try:
        p = page(result[0])  # use preload=True when it's fixed: https://github.com/goldsmith/Wikipedia/issues/78
    except DisambiguationError as e:
        errors.append("Random disambig page: ")
        while attempt < 3:
            try:
                p = page(choice(e.options))
            except DisambiguationError:
                pass
            attempt += 1
        if not p:
            return bot.say("Gave up looking for disambiguous entry from disambiguous page.")
    if result[1]:
        errors.append("(SP: %s?) " % result[1])
    content = p.content[:800].replace("\n", " ").replace("====", "").replace("===", "").replace("==", "")
    bot.say(RESULT_RPL % ("".join(errors), p.url), strins=[p.title, content], fcfs=True)
def getContentFromLink(link):
    try:
        linkText = wk.page(link, auto_suggest=False).content.lower()
    except wk.exceptions.DisambiguationError as e:
        # a list comprehension is needed on Python 3, where filter() returns an iterator
        options = [x for x in e.options if "(disambiguation)" not in x]
        linkText = wk.page(options[0], auto_suggest=False).content.lower()
    return linkText
def disambiguationWikipedia(noun):
    """ Disambiguation for Wikipedia errors """
    # Try to get wikipedia content
    try:
        wiki = wikipedia.page(noun)
    except wikipedia.exceptions.DisambiguationError as e:
        new = e.options[0]
        try:
            wiki = wikipedia.page(new)
        except:
            return 'Null'
    except wikipedia.exceptions.PageError:
        new = wikipedia.search(noun)
        try:
            wiki = wikipedia.page(new[0])
        except:
            return 'Null'
    except:
        return 'Null'
    return wiki
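An illustrative call to disambiguationWikipedia() above; "Mercury" is only an example of a heavily disambiguated title, and the 'Null' check mirrors the sentinel the helper returns on failure.

wiki = disambiguationWikipedia("Mercury")
if wiki != 'Null':
    # on success the helper returns a WikipediaPage object
    print(wiki.title, wiki.url)
else:
    print("No usable Wikipedia page found")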
def search_wikipedia(word):
    searchArr = wikipedia.search(word)
    wiki_results = []
    try:
        try:
            for result in searchArr:
                # print("result: " + result)
                wiki_results.append(wikipedia.page(result, preload=False))
        except wikipedia.DisambiguationError as e:
            # print("disambiguation error on " + result)
            # print(e.with_traceback)
            try:
                for item in e.options:
                    # print("disambiguation error on " + item)
                    wiki_results.append(wikipedia.page(item, preload=False))
            except wikipedia.DisambiguationError as i:
                try:
                    for item in i.options:
                        # print("disambiguation error on " + item)
                        wiki_results.append(wikipedia.page(item, preload=False))
                except wikipedia.DisambiguationError:
                    pass
    except:
        print("Something went wrong getting wikipedia results")
        pass
    return wiki_results
def page(title=None, pageid=None, auto_suggest=True, redirect=True):
    """
    The user's search term may not correspond to a Wikipedia page title,
    because it can be vague. Two mechanisms handle this: redirects and
    disambiguation.
    :param auto_suggest: let Wikipedia find a valid page title for the query
    :return:
    """
    if pageid is not None:
        pageid = int(pageid)
        page = WikipediaArticle.objects(pageid=pageid)
    else:
        page = WikipediaArticle.objects(title=title)
    if not page:
        results, suggestion = WikipediaWrapper.search(
            title, results=1, suggestion=True)
        suggested_term = suggestion or results[0]
        page = WikipediaArticle.objects(title=suggested_term)
    if page:
        page = page[0]
    else:
        try:
            page = wikipedia.page(title=title,
                                  pageid=pageid,
                                  auto_suggest=auto_suggest,
                                  redirect=redirect)
        except UnicodeDecodeError:
            page = wikipedia.page(title=str_util.normal_str(title),
                                  pageid=pageid,
                                  auto_suggest=auto_suggest,
                                  redirect=redirect)
    if type(page) is wikipedia.WikipediaPage:
        page = WikipediaWrapper.save_page(page)
    return page
def climb_tree(self):
    """Climb the tree"""
    branch_found = True
    cur_branch = self.seed
    prev_node = None
    while cur_branch is not None:
        self.logger.debug('Current branch is %s' % cur_branch)
        # Get wikipedia page
        try:
            cur_page = wikipedia.page(cur_branch)
        except wikipedia.PageError:
            self.logger.exception('Cannot find page for %s. Ending search.' % cur_branch)
            self.tree.node(cur_branch)
            self.tree.edge(cur_branch, prev_node)
            cur_branch = None
            continue
        except wikipedia.DisambiguationError:
            self.logger.exception('Multiple pages found for query %s. Adding "(physicist)" and searching again.' % cur_branch)
            cur_page = wikipedia.page(cur_branch + ' (physicist)')
        # parse the table
        html_source = BeautifulSoup(cur_page.html(), 'html.parser')
        advisor = self._search_info_table(html_source, ['Doctoral advisor', 'Doctoral advisors', 'Academic advisors', 'Academic advisor'])
        alma_mater = self._search_info_table(html_source, 'Alma mater')
        students = self._search_info_table(html_source, 'Doctoral students')
        # add to graph
        self.tree.node(cur_branch, cur_branch + '\n' + self._none_filter(alma_mater))
        if prev_node is not None:
            self.tree.edge(cur_branch, prev_node)
        # update
        prev_node = cur_branch
        cur_branch = self._res_filter(advisor)
def link_checker(ngram):
    '''
    Checks if the word gives a valid wikipedia link
    '''
    try:
        page = wikipedia.page(ngram)
        link = page.url
        return link
    except wikipedia.exceptions.DisambiguationError:
        # link = ngram.split(" ")
        # newlink = "_".join(ngram)
        link = 'http://en.wikipedia.org/wiki/' + ngram + '_(disambiguation)'
        return link
    except wikipedia.exceptions.PageError:
        # drop title words such as "prime", "minister", "president" and retry
        wordlist = [word for word in ngram.split()
                    if word.lower() not in ["prime", "minister", "president"]]
        ngram = " ".join(wordlist)
        try:
            page = wikipedia.page(ngram)
            link = page.url
            return link
        except wikipedia.exceptions.PageError:
            return -1
        except wikipedia.exceptions.DisambiguationError:
            return -1
def collectFrom(lang, start, hangang):
    wikipedia.set_lang(lang)
    lookpa = wikipedia.page(start).links
    lookna = [wikipedia.page(start)]
    corpus = str(wikipedia.page(start).content)
    while len(corpus) < hangang:
        random.shuffle(lookpa)
        item = lookpa[0]
        try:
            corpus += str(wikipedia.page(item).content)
        except wikipedia.exceptions.PageError:
            pass
        except wikipedia.exceptions.DisambiguationError:
            pass
        except KeyError:
            pass
        lookna.append(item)
        lookpa.remove(item)
        try:
            for page in wikipedia.page(item).links:
                if page not in lookpa:
                    if page not in lookna:
                        lookpa.append(page)
        except wikipedia.exceptions.PageError:
            pass
        except wikipedia.exceptions.DisambiguationError:
            pass
        except KeyError:
            pass
        print('Corpus = ' + str(len(corpus)) + ' Searched = ' + str(len(lookna)) + ' Still = ' + str(len(lookpa)))
    f = open(lang + 'FromWikiCorp.txt', 'w')
    f.write(corpus)
    f.close()
def context_data(self):
    """ Gather data from Wikipedia based on the user-supplied SUBJECT. """
    text_list, visited, visitedSeeAlso, queue = [], set(), set(), list()
    queue.append((self.subject, self.depth))
    while len(queue) > 0:
        next = queue.pop(0)
        try:
            if next[0] not in visited and next[1] >= 0:
                visited.add(next[0])
                results = wikipedia.search(next[0], self.max_searches, False)
                for pagename in results:
                    queue.append((pagename, next[1] - 1))
                text_list.extend(wikipedia.page(next[0]).content.split())
        except:
            pass
    queue.append((self.subject, self.depth))
    while len(queue) > 0:
        next = queue.pop(0)
        try:
            if next[0] not in visitedSeeAlso and next[1] >= 0:
                visitedSeeAlso.add(next[0])
                page = wikipedia.page(next[0])
                for reference in page.section("See also").splitlines():
                    queue.append((reference, next[1] - 1))
                text_list.extend(wikipedia.page(next[0]).content.split())
        except:
            pass
    return text_list
def getWikiPage(title):
    try:
        page = wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        title = random.choice(e.options)
        page = wikipedia.page(title)
    return page
def test_tax_holiday(self):
    print("------ tax holiday test ------------")
    web_bot = wikipedia.page("Web Bot").title
    tax_holiday = wikipedia.page("Tax holiday").title
    article1 = Article(web_bot, repo=DataDict())
    article2 = Article(tax_holiday, op=op_backlinks, repo=DataDict())
    run_test(article1, article2)
    print('=========================================')
def test_impeachment(self):
    print("------ Impeachment test ------------")
    impeachment = wikipedia.page("Impeachment").title
    tower = wikipedia.page("Trump Tower").title
    article1 = Article(impeachment, repo=DataDict())
    article2 = Article(tower, op=op_backlinks, repo=DataDict())
    run_test(article1, article2)
    print('=========================================')
def wikipediaSearch(self): LOGGER.info('Querying Wikipedia') neighborhood = False if self.location['neighborhood'] != '': neighborhood = True searchTerm = self.location['neighborhood'] + ' ' + self.location['city'] elif self.location['neighborhood'] == '' and self.location['city'] != '' and self.location['region'] != '': searchTerm = self.location['city'] + ' ' + self.location['region'] elif self.location['place_name'] != '': searchTerm = self.location['place_name'] LOGGER.info('WIKI SEARCH TERM: ' + searchTerm) wikiPages = list() try: LOGGER.info('trying first wiki query') results = wikipedia.search(searchTerm) if len(results) != 0: if len(results) >= 3: results = results[:3] for result in results: try: page = wikipedia.page(result) wikiPages.append(page.content) except wikipedia.exceptions.DisambiguationError as e: pass #print 'Disambiguation Error' #print e except wikipedia.exceptions.PageError as e: #print 'Page Error' #print e pass except wikipedia.exceptions.DisambiguationError as e: #print 'DISAMBIGUATION ERROR' #print e.options if len(e.options) !=0: if len(e.options) >= 3: e.options = e.options[:3] for opt in e.options: try: page = wikipedia.page(opt) wikiPages.append(page.content) except wikipedia.exceptions.DisambiguationError as e: #print 'Disambiguation Error' #print e pass except wikipedia.exceptions.PageError as e: #print 'Page Error' #print e pass allText = '' if len(wikiPages) != 0: for page in wikiPages: allText += page self.results['wikipedia'] = allText
def getText():
    commonHeaders = []
    popularity = []
    yourArticle = wikipedia.page("Obama")
    articles = []
    articles.append(wikipedia.search("American politicians"))
    articles.append(wikipedia.search("American presidents"))
    articles.append(wikipedia.search("Hillary Clinton"))
    articles.append(wikipedia.search("Bill Clinton"))
    articles.append(wikipedia.search("George Washington"))
    articles.append(wikipedia.search("John Kerry"))
    #articles.append(wikipedia.search("John F. Kennedy"))
##    yourArticle = wikipedia.page("New York")
##    articles = wikipedia.search("New York")
##    articles.append(wikipedia.search("American cities"))
##    articles.append(wikipedia.search("Boston"))
##    articles.append(wikipedia.search("Paris"))
##    articles.append(wikipedia.search("San Francisco"))
##    articles.append(wikipedia.search("Sacramento"))
##    articles.append(wikipedia.search("Seattle"))
##    articles.append(wikipedia.search("Chicago"))
##    articles.append(wikipedia.search("St. Louis"))
##    articles.append(wikipedia.search("Las Vegas"))
##    articles.append(wikipedia.search("Hartford"))
##    articles.append(wikipedia.search("Trenton, NJ"))
##    articles.append(wikipedia.search("Washington D.C."))
##    articles.append(wikipedia.search("Boise"))
##    articles.append(wikipedia.search("Detroit"))
##    articles.append(wikipedia.search("Now Orleans"))
##    articles.append(wikipedia.search("Salt Lake City"))
    for i in articles:
        article = wikipedia.page(i).content
        headers = getSectionHeaders(article)
        for x in headers:
            if x not in commonHeaders:
                commonHeaders.append(x)
                popularity.append(1)
            else:
                assert(len(popularity) > 1)
                popularity[commonHeaders.index(x)] += 1
            print(x)
    print(commonHeaders)
    x = 0
    while (x < len(commonHeaders)):
        if (popularity[x] > 1):
            print(commonHeaders[x])
            print(popularity[x])
        x = x + 1
    # Figure out what kind of article this is
    # We can use the categories tag, if you've created it
    yourCategories = yourArticle.categories
    for category in yourCategories:
        #print category
        break
    return
def __init__(self, seed, sibling_depth=100):
    """Constructor for TreeClimber class"""
    self.logger = logging.getLogger(type(self).__name__)
    try:
        wikipedia.page(seed)
        self.seed = seed
    except wikipedia.PageError:
        self.logger.exception('Cannot find Wikipedia page for %s. Try another starting point.' % seed)
    self.tree = Digraph('G')
def new_article(new_title, old_title):
    # Takes two arguments (current page title and old page title), in case of
    # disambiguation when the user needs to choose another link.
    try:
        new_page = wikipedia.page(new_title)
        show_links(new_page)
    # Deals with wiki disambiguation error
    except wikipedia.exceptions.DisambiguationError:
        print("An error occurred (due to disambiguation), please choose another link.")
        show_links(wikipedia.page(old_title))
def createTestData(self, articleList):
    for article in articleList:
        articleFile = open(article + "_test", 'w+')
        article_nouns = open(article + "_nouns", 'w+')
        # fetch the content once and reuse it for both the TextBlob and the file
        articleContent = wikipedia.page(article).content.encode('UTF-8')
        articleContentTwo = TextBlob(articleContent)
        nouns = articleContentTwo.noun_phrases
        self.get_nouns(nouns, article_nouns)
        articleFile.write(articleContent)
        articleFile.close()
def getWikiText(query):
    results = wikipedia.search(query)
    try:
        page = wikipedia.page(title=results[0], auto_suggest=False)
    except wikipedia.DisambiguationError as e:
        page = wikipedia.page(e.options[0])
    text = page.content.encode('ascii', 'ignore').decode('ascii')
    # strip punctuation and digits; str.translate(None, ...) only works on Python 2
    cleanedText = text.translate(str.maketrans('', '', string.punctuation + digits)).lower()
    allWords = cleanedText.split()
    return allWords
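A brief usage sketch for getWikiText() above, assuming `import wikipedia`, `import string`, and `from string import digits` are present as in the original module; the query is purely illustrative.

words = getWikiText("machine learning")
print(len(words), "words; first ten:", words[:10])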
def __init__(self, page):
    """ initializes self.page to the correct wikipedia resource """
    try:
        self.page = wikipedia.page(page)
    except wikipedia.exceptions.DisambiguationError as e:
        self.page = wikipedia.page(e.options[0])
    self.soup = BeautifulSoup(self.page.html())
    self._gen_table()
def load_wiki(db):
    # load up desired terms from csv file
    med_terms = []
    with open("snomed_cleaned_term.txt", "r") as f:
        text = f.readlines()
    for line in text:
        med_terms.extend(line.split(","))
    con = db.connection()
    cur = db.cursor()
    missed = 0
    i = 0
    for term in med_terms:
        # look in wikipedia for page associated with term
        try:
            page = wikipedia.page(term)
        except wikipedia.exceptions.DisambiguationError as e:
            # handle disambiguation error:
            # prioritize choice if it has "medic" in title, grabs things like "(medicine)"
            possible = [x for x in e.options if re.search("medic", x.lower())]
            if possible:
                try:
                    page = wikipedia.page(possible[0])
                except:
                    missed += 1
                    continue
            # otherwise take the first choice of term
            else:
                try:
                    page = wikipedia.page(e.options[0])
                except:
                    missed += 1
                    continue
        # if can't find any pages, skip term
        except wikipedia.exceptions.PageError:
            missed += 1
            continue
        # join all the categories by "," to make it a string for input into sql
        try:
            categories = ",".join(page.categories)
        except:
            categories = ""
        # insert page into sql table
        cur.execute(
            "insert into med_pages VALUES (%s, %s, %s, %s, %s)",
            (int(page.pageid), page.title, page.summary, categories, page.content),
        )
        i += 1
    con.commit()
    print("# unidentifiable pages:", missed)
    print("Inserted:", i)
def main(): parser = argparse.ArgumentParser( description='Toma un tema de la Wikipedia y lo twitea como hilo') parser.add_argument('-c', action='store_true', help='Loguearse como el último usuario utilizado') parser.add_argument('-s', action='store_true', help='Postear no solo el resumen de Wikipedia sino ' + 'también sus secciones') args = parser.parse_args() use_last_creds = args.c post_sections = args.s wikipedia.set_lang('es') name = input('¿De qué quieres tirarte el pisto?: ') search_results = wikipedia.search(name) if len(search_results) > 1: result_str = '' for i, e in enumerate(search_results): result_str += '[{}] {}\n'.format(i+1, e) print() option = input('Sé más preciso:\n' + result_str + '\nNúmero de opción: ') page = wikipedia.page(search_results[int(option)-1]) elif len(search_results) == 1: page = wikipedia.page(search_results[0]) else: print('Lo siento, no hay nada para esa búsqueda :(') exit(0) # Store the page as a list of strings text = [u'No sé si conocéis {}. Abro hilo \U0001F447'.format(page.title)] text.extend([clean_string(i) for i in page.summary.splitlines()]) if post_sections: for section in page.sections: if section in IGNORED_SECTIONS: continue text.append('##{}'.format(section)) text.extend( [clean_string(i) for i in page.section(section).splitlines()]) # Split text into tweets tweets = split_text(text) print() twclient.post_as_thread(tweets, use_last_creds) print('¡Enhorabuena!' + '¡Ahora todos piensan que eres un experto en {}!'.format(page.title))
def getWikiURL(noun, tag): """ Get the Wikipedia URL """ if tag == "PERSON": try: wiki = wikipedia.page(noun) except wikipedia.exceptions.DisambiguationError as e: try: newNoun = e.options[0] newNoun2 = e.options[1] wiki = wikipedia.page(newNoun) wiki2 = wikipedia.page(newNoun2) firstSentence1 = wiki.content.split(".")[0] firstSentence2 = wiki2.content.split(".")[0] if "born" in firstSentence1: return wiki.url elif "born" in firstSentence2: return wiki2.url else: return "Null" except: return "Null" except wikipedia.exceptions.PageError: new = wikipedia.search(noun) try: wiki = wikipedia.page(new[0]) except: return 'Null' else: # Check for disambiguation on Wikipedia wiki = disambiguationWikipedia(noun) try: url = wiki.url except: return "Null" return url
def get_page_categories_unprotected(self, name, page_id=None):
    try:
        page = wiki.page(pageid=page_id)
    except (PageError, ValueError):
        page = wiki.page(name)
    if u'a' in page.categories:
        print(page.title)
    return self._afterprocess_categories(page.categories)
def get_text(self): try: # do a wikipedia search for the topic topic_results = wikipedia.search(self.topic) # pick one of the results and grab the content self.content += wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content # DO IT MORE more_content = wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content if more_content not in self.content: self.content += more_content more_content = wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content if more_content not in self.content: self.content += more_content more_content = wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content except wikipedia.exceptions.DisambiguationError as e: self.content += self.topic + self.uncertain except wikipedia.exceptions.PageError as e: self.content += self.topic + self.unknown # if there are more than one word in the topic try to get some more results with the first and last word if len(self.topic.split()) > 1: try: # get more results with less of the topic for some ambiguity topic_results = wikipedia.search(self.topic.split()[:1]) self.content += wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content more_content = wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content if more_content not in self.content: self.content += more_content except wikipedia.exceptions.DisambiguationError as e: self.content += self.topic + self.uncertain except wikipedia.exceptions.PageError as e: self.content += self.topic + self.unknown try: # get even more with the second half of the topic for wierd results maybe topic_results = wikipedia.search(self.topic.split()[-1:]) self.content += wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content more_content = wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content if more_content not in self.content: self.content += more_content except wikipedia.exceptions.DisambiguationError as e: self.content += self.topic + self.uncertain except wikipedia.exceptions.PageError as e: self.content += self.topic + self.unknown try: # do a wikipedia search for the topic topic_results = wikipedia.search(self.topic[:len(self.topic) / 2]) # pick one of the results and grab the self.content self.content += wikipedia.page(topic_results[rand(0, len(topic_results) - 1)]).content except wikipedia.exceptions.DisambiguationError as e: self.content += self.topic + self.uncertain except wikipedia.exceptions.PageError as e: self.content += self.topic + self.unknown return self.content.capitalize()
def verifyInput(prompt):
    while True:
        title = input(prompt)
        try:
            wikipedia.page(title)  # wikipedia.page() already fetches the article
        except wikipedia.PageError:
            print("Not a page")
            continue
        except wikipedia.DisambiguationError:
            print("Don't use disambiguation pages")
            continue
        return title
import wikipedia

term = input("Enter a title or search phrase")
while term != "":
    try:
        page = wikipedia.page(term)
        print(page.title)
        print(page.summary)
        print(page.url)
        term = input("Enter a title or search phrase")
    except wikipedia.exceptions.DisambiguationError:
        print("There are multiple results. Please specify")
        print(wikipedia.search(term))
        term = input("Enter a title or search phrase")
import wikipedia

menu_input = str(input('Enter "s" for search, enter "p" for page title \n>>>'))
while menu_input != '':
    if menu_input == 's':
        search_input = str(input('Search Request: '))
        results = wikipedia.search(search_input, 3)
        i = 0
        for result in results:
            print('{}.. {}'.format(i, result))
            i += 1
        choice = int(input('Choose Page: '))
        choice = wikipedia.page(results[choice])
        print(choice.title)
        print(choice.summary)
        print(choice.url)
    if menu_input == 'p':
        choice = wikipedia.page(str(input('Page Title: ')))
        print(choice.title)
        print(choice.summary)
        print(choice.url)
    menu_input = str(
        input('Enter "s" for search, enter "p" for page title \n>>>'))
(wp_title text primary key, page_url text, summary text, parent_id integer, revision_id integer)''') titles = set( x[0] for x in wpedia.execute('select distinct wp_title from wikipedia')) titles_done = set( x[0] for x in wpedia.execute('select wp_title from wp_page_info')) for t in (titles - titles_done): print(t) page = None try: page = wikipedia.page(t) except Exception as e: # catch everything and ignore print(e, file=sys.stderr) try: page = wikipedia.page(t.replace(' ', '_'), auto_suggest=False) except Exception as e: print(e, file=sys.stderr) if not page: continue values = (t, page.summary, page.url, int(page.parent_id), int(page.revision_id)) wpedia.execute( '''insert or ignore into wp_page_info (wp_title, summary, page_url, parent_id, revision_id) values (?,?,?,?,?)''', values)
import wikipedia

prefix = "|"


class Paragraph:
    title: str
    content: str

    def __init__(self, title, content):
        self.title = title
        self.content = content

    def toDict(self):
        return {"title": self.title, "content": self.content}


p = wikipedia.page("Jazz")
text = p.content
text = text.replace("=====", prefix)
text = text.replace("====", prefix)
text = text.replace("===", prefix)
text = text.replace("==", prefix)

tag = False
tags = []
tagname = ""
for i in range(len(text)):
    char = text[i]
    if char == prefix:
        if tag == False:
            tag = True
temp_key = int(keys[i]) person_name = person[temp_key] # print(person_name) # print(connected[keys[i]]) # print(connected[keys[i]][0]) search = wiki.search(person_name) if (len(search) != 0): count += 1 max_match = 0 max_index = 0 # flag = False for j in range(len(search)): # flag = False try: summary = wiki.page(search[j]).content temp_match = 0 for k in range(len(connected[keys[i]])): if connected[keys[i]][k].lower() in summary.lower(): temp_match += 1 if (max_match < temp_match): max_match = temp_match max_index = j except: print("") temp_row = str(temp_key) + '\t' + person_name + "\t" + search[ max_index] + "\t" + str(max_match) + "\t" + str( len(connected[keys[i]])) + "\n" print(temp_row) fin.write(temp_row)
def __init__(self, title):
    self.page = wikipedia.page(title)
    self.summary = TextBlob(self.page.summary)
async def search_wikipedia(self, ctx: commands.Context, args): """Fait une recherche sur wikipd""" wait = await ctx.send("_Je cherche..._") results = wikipedia.search(args) nbmr = 0 mmssgg = "" for value in results: nbmr = nbmr + 1 mmssgg = mmssgg + "**{}**: {} \n".format(str(nbmr), value) em = discord.Embed(title='Résultats de : ' + args, description=mmssgg, colour=0x4ECDC4) em.set_thumbnail(url="https://upload.wikimedia.org/wikipedia/commons/" "2/26/Paullusmagnus-logo_%28large%29.png") await wait.delete() sending = ["1⃣", "2⃣", "3⃣", "4⃣", "5⃣", "6⃣", "7⃣", "8⃣", "9⃣", "🔟"] def check(reaction, user): return user == ctx.author and reaction.emoji in sending and \ reaction.message.id == msg.id async def waiter(future: asyncio.Future): reaction, _ = await self.bot.wait_for('reaction_add', check=check) future.set_result(reaction.emoji) emoji = asyncio.Future() self.bot.loop.create_task(waiter(emoji)) msg = await ctx.send(embed=em) for e in sending: await msg.add_reaction(e) if emoji.done(): break while not emoji.done(): await asyncio.sleep(0.1) page = int(sending.index(emoji.result())) args_ = results[page] try: await msg.delete() await ctx.trigger_typing() wait = await ctx.send( ctx.message.author.mention + " ah ok sympa cette recherche, je l'effectue de suite !") wp = wikipedia.page(args_) wp_contenu = wp.summary[:200] + "..." em = discord.Embed(title='Wikipedia : ' + wp.title, description="{} \n_Lien_ : {} ".format( wp_contenu, wp.url), colour=0x9B59B6) em.set_author(name="Wikipedia", url='http://wikipedia.org', icon_url='https://upload.wikimedia.org/wikipedia/' 'commons/2/26/Paullusmagnus-logo_%28large' '%29.png') em.set_thumbnail(url="https://upload.wikimedia.org/wikipedia/" "commons/2/26/Paullusmagnus-logo_%28large" "%29.png") em.set_footer(text="Merci à eux de nous fournir une encyclopédie " "libre !") await wait.delete() await ctx.send(embed=em) except wikipedia.exceptions.PageError: # TODO : A virer dans l'event on_error await ctx.send(":open_mouth: Une **erreur interne** est survenue," " si cela ce reproduit contactez votre" " administrateur ou faites une Issue sur" " ``gitea`` !")
def wiki_get_coordinates(places_list): ''' Get location coordinate of places in a list Args: places_list: string list of places Return: coord_list: list of coordinates of the inputted places not_found: list of places without wikipedia page available ''' import wikipedia coord_list = [] not_found = [] for n, value in enumerate(places_list): try: coord = wikipedia.page(value).coordinates keywords = value #print(value + " NPP", "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", value) new_value = value + " Nuclear Power Plant" #print("Search using keywords:", new_value) try: coord = wikipedia.page(new_value).coordinates keywords = new_value #print(new_value, "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", new_value) new_value2 = value[:-2] + " NPP" #print("Search using keywords:", new_value2) try: coord = wikipedia.page(new_value2).coordinates keywords = new_value2 #print(new_value2, "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", new_value2) new_value3 = new_value2[:-4] + " Nuclear Power Plant" #print("Search using keywords:", new_value3) try: coord = wikipedia.page(new_value3).coordinates keywords = new_value3 #print(new_value3, "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", new_value3) new_value4 = new_value2[:-6] + " NPP" #print("Search using keywords:", new_value4) try: coord = wikipedia.page(new_value3).coordinates keywords = new_value3 #print(new_value3, "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", new_value3) new_value5 = new_value2[:-6] + " Nuclear Power Plant" #print("Search using keywords:", new_value5) try: coord = wikipedia.page(new_value5).coordinates keywords = new_value5 #print(new_value5, "coordinates is ", coord) except (KeyError, wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError): #print("No wikipedia page named", new_value5) not_found.append(value) keywords = value coord = (0, 0) coordinate = [keywords, float(coord[0]), float(coord[1])] coord_list.append(coordinate) return coord_list, not_found
""" CP1404 - Practicals Wikipedia """ import wikipedia user_search = input("Search: ") while user_search != "": wiki_page = wikipedia.page(user_search) try: print("Title: " + wiki_page.title) print(wikipedia.summary(user_search)) print("URL: " + wiki_page.url) user_search = input("Search: ") except wikipedia.exceptions.DisambiguationError as e: print(e.options) user_search = input("Search: ")
def index(request, lemma): # # Check if we found something in our own sparql repository. If not # query other sources. # # TODO: We need a better check (persons with the same name). # #if not sparql_results or not sparql_results["results"]["bindings"]: if False: # # DBPEDIA # sparql = SPARQLWrapper(DBPEDIA_QUERY_URL) sparql.setQuery(SPARQL_DBPEDIA_QUERY.format(lemma)) sparql.setReturnFormat(JSON) try: sparql_results = sparql.queryAndConvert() except: import traceback print traceback.format_exc() sparql_results = {} #if sparql_results and sparql_results["results"]["bindings"]: # for result in sparql_results["results"]["bindings"]: # from .utils import sparql_local_insert_person # # sparql_local_insert_person(lemma, result) #else: if True: # # CBDB # r = requests.get(CBDB_API_URL.format(lemma)).json() #if r.status_code == 200: try: persons = r['Package']['PersonAuthority']['PersonInfo'][ 'Person'] except: persons = [] if type(persons) is list: for person in persons: print person['BasicInfo']['ChName'], person['BasicInfo'][ 'YearBirth'], person['BasicInfo']['PersonId'] else: person = persons if person: print person['BasicInfo']['ChName'], person['BasicInfo'][ 'YearBirth'], person['BasicInfo']['PersonId'] sparql = SPARQLWrapper(FUSEKI_QUERY_URL) sparql.setQuery(sparql_query.format(lemma)) sparql.setReturnFormat(JSON) try: sparql_results = sparql.queryAndConvert() except: sparql_results = {} sparql = SPARQLWrapper(FUSEKI_QUERY_URL) sparql.setQuery(sparql_query.format(lemma)) sparql.setReturnFormat(JSON) try: sparql_results = sparql.queryAndConvert() except: sparql_results = {} is_person = False template_result = {} if sparql_results.get("results", False): for result in sparql_results["results"]["bindings"]: p = result["p"]["value"].replace(prefix_default, '') p = p.replace(prefix_schema, '') p = p.replace(prefix_syntax, '') o = result["o"]["value"].replace(prefix_default, '') if p == "type" and o == "Person": is_person = True template_result[p] = o template_result['is_person'] = is_person template_result['lemma'] = lemma # Wikipedia try: wikipedia.set_lang("en") en = wikipedia.page(lemma, auto_suggest=True, redirect=True) wikipedia_en = en.summary wikipedia_en_url = en.url except: wikipedia_en = '' wikipedia_en_url = '' try: wikipedia.set_lang("zh") zh = wikipedia.page(lemma, auto_suggest=True, redirect=True) wikipedia_zh = zh.summary wikipedia_zh_url = zh.url except: wikipedia_zh = '' wikipedia_zh_url = '' # Sinology try: f = codecs.open("/docker/dublin-store/sinology/mainSpace/" + lemma, "r", "utf-8") # Skip first line sinology = f.readlines()[1:] sinology = '\n'.join(sinology) sinology = creole.creole2html(sinology) except: sinology = '' return render( request, 'sparql/index.html', { 'r': template_result, 'wikipedia_en': wikipedia_en, 'wikipedia_zh': wikipedia_zh, 'wikipedia_en_url': wikipedia_en_url, 'wikipedia_zh_url': wikipedia_zh_url, 'sinology': sinology, })
def get_wiki_url_and_content_by_keyphrase(phrase):
    with warnings.catch_warnings():  # TODO warning suppression
        warnings.simplefilter("ignore")
        wiki_page = wikipedia.page(phrase)
    return wiki_page.url, wiki_page.summary, wiki_page.categories
import wikipedia

query = wikipedia.page("Hero")
print(query.summary)
with mss.mss() as sct: while Number > 0: im3 = numpy.asarray(sct.grab(grabOption3)) # im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) text3 = "River Wye" Number = Number - 1 q1point = 0 q2point = 0 q3point = 0 #Searches the question Questions = wikipedia.page(textq) Question1 = Questions.links #Count how many times each come up in the Question links1 = [text1] test = [] for link in links1: try: #try to load the wikipedia page page = wikipedia.page(link, auto_suggest=False) test.append(page) except wikipedia.exceptions.PageError: #if a "PageError" was raised, ignore it and continue to next link continue
import wikipedia
import requests
import cairosvg

PAGES = ['2016 Summer Olympics', 'London']

for page in PAGES:
    wikipage = wikipedia.page(page)
    print("Page Title: ", wikipage.title)
    print("Page URL: ", wikipage.url)
    cairosvg.svg2png(
        url="https://upload.wikimedia.org/wikipedia/en/d/df/2016_Summer_Olympics_logo.svg",
        write_to="/WikimediaDataLiquidGalaxy/static/images/image2222.png")
    print(" - Main Image: ", wikipage.images)
def get_wikipedia_intro(symbol):
    import wikipedia
    company_name = get_fundamental_information(symbol)[0]
    description = wikipedia.page(company_name).content
    return description.split('\n')[0]
def parseWiki(self):
    self.text = wikipedia.page(title=self.webpage).content
    self.text = self.CleanText()
def assistant(self,command): #open subreddit Reddit if 'open reddit' in command: reg_ex = re.search('open reddit (.*)', command) url = 'https://www.reddit.com/' if reg_ex: subreddit = reg_ex.group(1) url = url + 'r/' + subreddit webbrowser.open(url) self.sofiaResponse('The Reddit content has been opened for you Sir.') self.sofia('The Reddit content has been opened for you Sir.') self.sofia(url) elif 'shutdown' in command or 'bye' in command or 'tata' in command : self.sofiaResponse('Bye bye. Have a nice day') self.sofia('Bye bye. Have a nice day') sys.exit() elif 'open website' in command: reg_ex = re.search('open website (.+)', command) if reg_ex: domain = reg_ex.group(1) print(domain) url = 'https://www.' + domain + '.com' webbrowser.open(url) self.sofiaResponse('website ' + url + ' is opened') self.sofia('website' + url + 'is opened') #self.label6 = tk.Label(self.root, text='website' + url + 'is opened') #self.label6.grid() else: pass # wait until thread 1 is completely executed elif 'play youtube video for' in command: reg_ex = re.search('youtube (.+)', command) if reg_ex: domain = reg_ex.group(1) print(domain) url = 'https://www.youtube.com/results?search_query=' + domain webbrowser.open(url) self.sofiaResponse('The youtube videos are available.') self.sofia('The youtube videos are available.' + url) #self.label6a = tk.Label(self.root, text='The youtube videos are available.' + url) #self.label6a.grid() else: pass #google search elif 'google' in command or 'please google' in command: #what happens when google keyword is recognized reg_ex = re.search('google (.+)', command) words = command.split() del words[0:1] st = ' '.join(words) print('Google Results for: '+ str(st)) url='http://google.com/search?q='+ st webbrowser.open(url) self.sofiaResponse('Google Results for: '+str(st)) self.sofia('Google Results for: '+str(st) + url) #greetings elif 'hello' in command or 'hey' in command: day_time = int(strftime('%H')) if day_time < 12: self.sofiaResponse('Hello ASH. Good morning') self.sofia('Hello ASH. Good morning') elif 12 <= day_time < 18: self.sofiaResponse('Hello ASH. Good afternoon') self.sofia('Hello ASH. Good afternoon') else: self.sofiaResponse('Hello ASH. Good evening') self.sofia('Hello ASH. Good evening') #joke elif 'joke' in command: res = requests.get( 'https://icanhazdadjoke.com/', headers={"Accept":"application/json"}) if res.status_code == requests.codes.ok: self.sofiaResponse(str(res.json()['joke'])) self.sofia(str(res.json()['joke'])) else: self.sofiaResponse('oops!I ran out of jokes') self.sofia('oops!I ran out of jokes') #top stories from google news elif 'news for today' in command or 'news' in command: try: news_url="https://news.google.com/news/rss" Client=urlopen(news_url) xml_page=Client.read() Client.close() soup_page=soup(xml_page,"xml") news_list=soup_page.findAll("item") for news in news_list[:3]: self.sofiaResponse(news.title.text.encode('utf-8')) self.sofia(news.title.text) except Exception as e: print(e) #current weather elif 'current weather' in command or 'weather' in command: reg_ex = re.search('current weather in (.*)', command) if reg_ex: city = reg_ex.group(1) owm = OWM(API_key='ab0d5e80e8dafb2cb81fa9e82431c1fa') obs = owm.weather_at_place(city) w = obs.get_weather() k = w.get_status() x = w.get_temperature(unit='celsius') self.sofiaResponse('Current weather in %s is %s. The maximum temperature is %0.2f and the minimum temperature is %0.2f degree celcius' % (city, k, x['temp_max'], x['temp_min'])) self.sofia('Current weather in %s is %s. 
The maximum temperature is %0.2f and the minimum temperature is %0.2f degree celcius' % (city, k, x['temp_max'], x['temp_min'])) #time elif 'time' in command: import datetime now = datetime.datetime.now() self.sofiaResponse('Current time is %d hours %d minutes' % (now.hour, now.minute)) self.sofia('Current time is %d hours %d minutes' % (now.hour, now.minute)) #email elif 'please email' in command: self.sofiaResponse('Who is the recipient?') recipient = self.myCommand() if 'ash' in recipient: msg = MIMEMultipart() # storing the senders email address msg['From'] = fromaddr # storing the receivers email address msg['To'] = toaddr # storing the subject msg['Subject'] = "hello" # string to store the body of the mail body = "Body_of_the_mail" # attach the body with the msg instance msg.attach(MIMEText(body, 'plain')) # open the file to be sent filename = "final forward.pdf" attachment = open("C:/Users/AISHU/Desktop/final forward.pdf", "rb") # instance of MIMEBase and named as p p = MIMEBase('application', 'octet-stream') # To change the payload into encoded form p.set_payload((attachment).read()) # encode into base64 encoders.encode_base64(p) p.add_header('Content-Disposition', "attachment; filename= %s" % filename) # attach the instance 'p' to instance 'msg' msg.attach(p) # creates SMTP session s = smtplib.SMTP('smtp.gmail.com', 587) # start TLS for security s.starttls() # Authentication s.login(fromaddr,"xyfsfvqsgawkfhjh") # Converts the Multipart msg into a string text = msg.as_string() # sending the mail s.sendmail(fromaddr, toaddr, text) # terminating the session s.quit() self.sofiaResponse('Email has been sent successfuly. You can check your inbox.') self.sofia('Email has been sent successfuly. You can check your inbox.') else: self.sofiaResponse('I don\'t know what you mean!') self.sofia('I don\'t know what you mean!') #launch any folder elif 'from desktop view folder ' in command or 'view folder' in command: reg_ex = re.search('from desktop view folder (.*)', command) if reg_ex: appname = reg_ex.group(1) appname1 = appname #subprocess.call(["open", "-n", "/C:\/" + appname1], stdout=subprocess.PIPE) os.startfile('C:/Users/AISHU/Desktop/' + appname1) #subprocess.call() self.sofiaResponse('I have launched the desired application') self.sofia('I have launched the desired application') #calculation elif 'calculate' in command: reg_ex = re.search('calculate (.*)', command) app_id = "AUXH6Q-LA7AA5J66V" client = wolframalpha.Client(app_id) indx = command.lower().split().index('calculate') query = command.split()[indx + 1:] res = client.query(' '.join(query)) answer = next(res.results).text self.sofiaResponse("The answer is " + answer) self.sofia("The answer is " + answer) elif 'thank you' in command or 'thanks' in command: self.sofiaResponse('your welcome') self.sofia('your welcome') elif 'help me' in command: self.sofiaResponse(""" You can use these commands and I'll help you out: 1-. Open reddit subreddit 2. Open website 3. play youtube video for 4. please google 5. from desktop view folder 6. news for today 7. Joke 8. Send email/email 9. Current weather in {cityname} 10. change wallpaper 11. Time 12. tell me about xyz 13. Calculate 14 where is {location} 15. 
launch app """) #wallpaper elif 'change wallpaper' in command: reg_ex = re.search('change wallpaper (.*)', command) path_user = os.path.expanduser('~') if reg_ex: appname = reg_ex.group(1) appname1 = appname +'.jpg' #name_of_file = 'img.jpg' path_to_file = os.path.join(path_user,'Desktop','wallpaper',appname1) print(path_to_file) # this print C:\Users\Sebastian\Desktop\wallpapers\vRATOkv.jpg SPI_SETDESKWALLPAPER = 20 ctypes.windll.user32.SystemParametersInfoW(SPI_SETDESKWALLPAPER, 0, path_to_file, 0) self.sofiaResponse('I have changed the desired wallpaper') self.sofia('I have changed the desired wallpaper') #launch any app elif 'launch app' in command: reg_ex = re.search('launch app (.*)', command) if reg_ex: appname = reg_ex.group(1) appname1 = appname + ".lnk" #subprocess.call(["open", "-n", "/C:\/" + appname1], stdout=subprocess.PIPE) os.startfile('C:/ProgramData/Microsoft/Windows/Start Menu/Programs/' + appname1) #subprocess.call() self.sofiaResponse('I have launched the desired application') self.sofia('I have launched the desired application') #meanings elif 'tell me about' in command: reg_ex = re.search('tell me about (.*)', command) if reg_ex: topic = reg_ex.group(1) ny = wikipedia.page(topic) self.sofiaResponse(ny.content[:500].encode('utf-8')) self.sofia(ny.content[:500].encode('utf-8')) #location elif "where is" in command: reg_ex = re.search('where is (.*)', command) command = command.split(" ") location = command[2] self.sofiaResponse("Hold on ASH, I will show you where " + location + " is.") webbrowser.open("https://www.google.nl/maps/place/" + location + "/&") self.sofia("Hold on ASH, I will show you where " + location + " is." + "https://www.google.nl/maps/place/" + location + "/&" ) else: self.sofiaResponse('sorry i dont understand please rephrase your sentence') self.sofia('sorry i dont understand please rephrase your sentence')
    while left > 0:
        get = min(per_cycle, left)
        left -= get
        yield wikipedia.random(pages=get)


def process(bunch):
    for word in bunch:
        print(word)


for titles in get_random_wikipedia_titles(amount):
    bunch = []
    for title in titles:
        word = {}
        try:
            page = wikipedia.page(title=title)
        except wikipedia.exceptions.DisambiguationError:
            print("Disambiguation in title %s, skipping" % title)
            continue
        word['word'] = title
        word['description'] = page.summary
        word['translations'] = {}
        try:
            word['imageurl'] = page.images[0]
        except:
            word['imageurl'] = "http://www.catster.com/files/original.jpg"
        bunch.append(word)
    for lang in langs:
        translations = gs.translate([word['word'] for word in bunch], lang, 'fi')
        for translation, word in zip(translations, bunch):
# ---------------------------------------------------- #
# WikiWriter
# Section Scrapers
# By Kristina Wagner and Sharon Lin
# Copyright April 2016
# -----------------------------------------------------#

# Note that subheaders always begin with "="
def getSectionHeaders(article):
    headers = []
    startIndex = 0
    while startIndex != -1:
        # Find Start
        startIndex = article.find("==")
        # Find end
        subarticle = article[startIndex + 2:]
        endIndex = subarticle.find("==")
        if (startIndex > -1) and (endIndex > -1):
            if startIndex != endIndex:
                if endIndex - startIndex < 100 and endIndex > startIndex:
                    header = article[startIndex + 2:startIndex + 2 + endIndex]
                    header = header.strip(' \t\n\r')
                    if header != "" and header != "\n" and header != "=":
                        print(header)
                        headers.append(header)
        article = article[endIndex + 2:]
    return headers


article = wikipedia.page("Obama").content
getSectionHeaders(article)
# test-docx.py
from docx import Document
import wikipedia

wikipedia.set_lang('th')
# summary: a condensed version of the article
data = wikipedia.summary('ประเทศไทย')
# page + content: the full article text
data2 = wikipedia.page('ประเทศไทย')
data2 = data2.content

doc = Document()  # create a Word file in Python
doc.add_heading('แมว', 0)
doc.add_paragraph(data2)
doc.save('ประเทศไทย.docx')
print('สร้างไฟล์สำเร็จ')
def summary():
    query = wikipedia.page(question.get())
    answer = Text(root, height=100, width=160, font=("Arial", 14), wrap=WORD, bg="#7CEBC6", fg="black")
    answer.insert(END, query.summary)
    answer.pack()
if not os.path.isfile(str(word + "_tot.json")): print(word) banlist.append(word) for word in banlist: print(word) random.seed(word) current = "" if word in done: continue with open(word + ".json", 'r') as f: titles = json.load(f) while len(current) < target: rand_index = random.randrange(len(titles)) try: content = wikipedia.page(titles[rand_index]).content current += "\n" + content except wikipedia.PageError: continue except wikipedia.DisambiguationError: continue except wikipedia.WikipediaException: print("wikipedia exception") time.sleep(5) pass except: print("unknown error") time.sleep(5) continue done.append(word) with open(word + "_tot.json", 'w') as f:
def getPage(search): return wikipedia.page(search)
import pandas as pd
import wikipedia as wp

sp_500_current = []

# Get the html source
html = wp.page("List of S&P 500 companies").html().encode("UTF-8")
df = pd.read_html(html)[0]
for key, row in df.iterrows():
    sp_500_current.append(row["Symbol"])

print(len(sp_500_current), sp_500_current)
def get_wiki(self):
    title = w.search(self.query)[0]  # get first result
    page = w.page(title)             # get page
    return page.content              # return page content
            break
        elif compute_jaccard_index(
                str(title_no_year + " (" + year + " film" + ")"), result) >= 1:
            current_query = result
            break
        # elif "film" in result:
        #     current_query = result
        #     break
        else:
            current_query = "no_results"
    print("current_query:\t\t", current_query, "/ for movie: ", full_title)
    if current_query != "no_results":
        try:
            movie_page = wikipedia.page(current_query)
            go_flag = True
        except:
            print("PLOT------DisambiguationError for:", full_title)
            go_flag = False
        if go_flag:
            section_results = [
                unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
                for x in movie_page.sections
            ]
            # f.write(str(full_title + "\t" + current_query + "\t" + str(search_results) + "\n"))
            print("sections for\t\t", current_query, section_results, "\n")
            if "Plot" in section_results:
                # print movie_page.section("Plot").replace("\n"," ")
                # plot = movie_page.section("Plot")
def get_wiki(query):
    title = wikipedia.search(query)[0]
    page = wikipedia.page(title)
    return page.content
def wiki_look(context):
    author = wikipedia.page(context.auname)
    author.sections
    print("looks for author info using Wikipedia API")  # mock
import wikipedia

data = wikipedia.page("Donald Trump")
file = open('output_wiki_DonaldTrump.txt', 'w', encoding='utf8')
file.write(data.content)
file.close()
    args.gradient_accumulation_steps = 1
    args.local_rank = -1
    args.fp16 = False
    args.loss_scale = 0
    args.server_ip = ''
    args.server_port = ''
    run_squad.main(args)
    with open('../files/results/nbest_predictions.json', 'r') as f:
        output = f.read()
    return json.loads(output)


if __name__ == '__main__':
    page = wikipedia.page('History of the United Kingdom')
    input = {
        "data": [
            {
                "questions": ["What sanctions are in place?"],
                "url": "https://www.bbc.co.uk/news/world-us-canada-48748544",
                "context": str(page.content)
            }
        ]
    }
    input = json.dumps(input)
    answers = answer_questions(input)
    print(answers)