def extract_Wikipage_from_title(wiki_title, verbose=True):
    """Return the plain-text content of the Wikipedia page *wiki_title*.

    Parameters
    ----------
    wiki_title : str
        Exact page title to fetch.
    verbose : bool
        When True, print a message if the page cannot be fetched.

    Returns
    -------
    str or list
        Page content on success; an empty list on any failure (kept as a
        list, not '', to preserve the original return contract).
    """
    try:
        return wikipedia.WikipediaPage(wiki_title).content
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. The original placed this print AFTER `return []`,
        # making it unreachable; moved before the return so `verbose` works.
        if verbose:
            print(wiki_title, 'not found.')
        return []
def __get_wiki_page(self):
    """Look up the page for ``self.search``, falling back to the main page.

    Returns the matching Wikipedia page; when the query resolves to no
    page at all, prints a hint and returns the 'Main_Page' page instead.
    """
    try:
        return wiki.page(self.search, auto_suggest=True)
    except PageError:
        hint = ('No Page Found. Try adding a year to your query '
                'or check if information is correct.')
        print(hint)
        return wiki.WikipediaPage('Main_Page')
def find_latin_name(nom_fr):
    """From a French common species name, build a one-entry dict.

    The key is the common name; the value is a list of:
    - the French common name
    - the Wikipedia URL for that name
    - its taxonomic rank
    - the Latin name
    (Docstring translated from French; runtime strings left untouched.)
    """
    global u  # NOTE(review): `u` is made global but only used locally here.
    dico_fr = dict()
    wikipedia.set_lang("fr")
    key = nom_fr
    search = wikipedia.search(nom_fr)
    # No search hit at all: return the "not found" sentinel entry.
    if search == list():
        taxon = "taxon non trouvé"
        nom_latin = "nom latin non trouvé"
        url = "https://fr.wikipedia.org/"
        dico_fr[key] = [key, url, taxon, nom_latin]
        return dico_fr
    search = search[0]
    wiki = wikipedia.WikipediaPage(search)
    url = wiki.url
    # Fetch the raw HTML to read the taxobox, which the wikipedia lib
    # does not expose directly.
    requete = requests.get(url)
    page = requete.content
    soup = BeautifulSoup(page, features="lxml")
    if soup.find("div", {"class": "center taxobox_classification"}) is None:
        nom_latin = "nom latin non trouvé"
    else:
        nom_latin = soup.find("div", {
            "class": "center taxobox_classification"
        }).text
        # Scan past leading spaces/quotes/dots; stop at the first
        # uppercase character after position 0 — that index marks the end
        # of the Latin binomial prefix to keep.
        for count, i in enumerate(nom_latin):
            u = int()
            if count == 0:
                continue
            elif i == " ":
                continue
            elif i == "'":
                continue
            elif i == ".":
                continue
            elif i.upper() == i:
                u = count
                break
        else:
            # for/else: no uppercase found — fall back to a fixed cut at 20.
            # NOTE(review): reconstructed from flattened source; confirm this
            # `else` belongs to the `for`, not the `if` chain.
            u = 20
        nom_latin = nom_latin[:u]
    if soup.find("p", {"class": "bloc"}) is None:
        taxon = "taxon non trouvé"
    else:
        taxon = soup.find("p", {"class": "bloc"}).text
    # Build the output dictionary.
    dico_fr[key] = [key, url, taxon, nom_latin]
    return dico_fr
def printContent(page_title):
    # Print the article text for *page_title* with every internal wiki link
    # annotated inline as "[[target||anchor text]]".  Python 2 code
    # (urllib2, print statement, bytes/str mixing).
    # page_title = "Information_retrieval"
    url_wiki = urllib2.unquote(page_title).decode('utf8')
    wiki_main = "https://en.wikipedia.org"
    folder = "/wiki/" + page_title
    data_read = urllib2.urlopen(wiki_main + folder).read()
    soup = BeautifulSoup(data_read, 'html.parser')
    # Plain-text article body, fetched separately via the wikipedia lib.
    wiki_content_py = wikipedia.WikipediaPage(url_wiki).content.encode("utf-8")
    div_id_content = soup.find("div", {"class": "mw-parser-output"})
    # Strip navigation chrome (TOC, navboxes, maintenance banners).  If
    # div_id_content is None, .find(...) raises AttributeError, which these
    # handlers also absorb.
    try:
        div_id_content.find("div", {"id": "toc"}).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find("table", {
            "class": "vertical-navbox nowraplinks plainlist"
        }).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find(
            "table", {
                "class": "plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox"
            }).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find("table", {"role": "presentation"}).decompose()
    except AttributeError:
        pass
    if div_id_content is not None:
        a_tags = div_id_content.findAll('a', href=True)
        wiki_links_page_title = list()
        # Keep only internal /wiki/ links whose anchor text is > 2 bytes.
        for a_tag in a_tags:
            if a_tag['href'][0:6] == "/wiki/" and len(
                    a_tag.text.encode("utf-8")) > 2:
                wiki_links_page_title.append(a_tag)
        start_index = 0
        # Splice "[[target||anchor]]" markers into the plain text; advance
        # start_index so later anchors only match after earlier ones.
        for link in wiki_links_page_title:
            text = link.text.encode("utf-8")
            link = link['href'][6:].encode("utf-8")
            found_at = wiki_content_py.find(text, start_index)
            if found_at != -1:
                wiki_content_py = wiki_content_py[:found_at] + "[[" + link + "||" + wiki_content_py[
                    found_at:found_at + len(text)] + "]]" + wiki_content_py[found_at + len(text):]
                # 4 = len("[[") + len("||"); skip past marker + target.
                start_index = found_at + 4 + len(link)
        lines = wiki_content_py.splitlines()
        # Print real prose lines: >3 letters and not a "{...}" template line.
        for line in lines:
            if count_aphabets(line) > 3 and not (line.find("{") != -1
                                                 and line.find("}") != -1):
                print line
def make_card(user_input):
    """Build a short info-card from the first infobox table of a page.

    Falls back to a search suggestion when the exact title does not
    resolve. Returns a list of "heading: value" strings (skipping the
    first entry), or the string " " when no usable infobox exists.
    """
    try:
        pg = wikipedia.WikipediaPage(title=user_input)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C/SystemExit propagate.
        # Retry with the top search suggestion for the query.
        p = wikipedia.search(query=user_input, suggestion=True)
        user_input = p[0][0]
        pg = wikipedia.WikipediaPage(title=user_input)
    try:
        pghtml = pg.html()
        soup = BeautifulSoup(pghtml, 'html.parser')
        table = soup.table
        heading = table.findAll('th')
        data = table.findAll('td')
    except Exception:
        # Page has no table at all (soup.table is None) — no card.
        return " "
    info_box = []
    # Pair up to the first 7 headings with their values.
    for h, d in zip(heading[:7], data[:7]):
        info_box.append(h.get_text() + ': ' + d.get_text())

    def removeNestedParentheses(s):
        # Strip "[...]" citation markers (possibly nested) from s.
        ret = ''
        skip = 0
        for i in s:
            if i == '[':
                skip += 1
            elif i == ']' and skip > 0:
                skip -= 1
            elif skip == 0:
                ret += i
        return ret

    print(len(info_box))
    if len(info_box) > 2:
        for x in range(len(info_box)):
            info_box[x] = removeNestedParentheses(info_box[x])
        # Drop the first row (usually the page title repeated).
        return info_box[1:]
    else:
        return " "
def get_image(user_input):
    """Return an https URL for the lead image of the page *user_input*.

    Skips a known maintenance-banner placeholder icon when it happens to
    be the first image with a height attribute.
    """
    page = wikipedia.WikipediaPage(title=user_input)
    soup = BeautifulSoup(page.html(), 'html.parser')
    # Placeholder shown on pages flagged as outdated — not a real image.
    use_less = ("//upload.wikimedia.org/wikipedia/commons/thumb/9/98/"
                "Ambox_current_red.svg/42px-Ambox_current_red.svg.png")
    candidate = soup.findAll('img', height=True)[0]
    if candidate.get('src') == use_less:
        candidate = soup.findAll('img')[1]
    # src is protocol-relative ("//upload..."); force https.
    return "https://" + candidate.get('src')[2:]
def get_paragraphs(question: str):
    """Fetch the content of the top search hit for *question* and pass it
    to get_five_paragraphs. Returns None when the search yields nothing.
    """
    search_results = wikipedia.search(question)
    # `wikipedia.search` returns a list, never None, so the original
    # `is None` guard could not fire and `search_results[0]` raised
    # IndexError on an empty result. Test for emptiness instead.
    if not search_results:
        print("couldn't find results!")
        return None
    target = search_results[0]
    page = wikipedia.WikipediaPage(title=target)
    contents = page.content
    return get_five_paragraphs(contents)
def scraping(title, movies):
    """Scrape summary and plot sections for each movie from Wikipedia.

    Parameters
    ----------
    title : sequence of str
        Page titles to fetch, indexed in step with *movies*.
    movies : sequence
        Drives the iteration count (one fetch per movie).

    Returns
    -------
    tuple of (list, list, list)
        (intro, plot, plot_s) — summaries, 'Plot' sections and
        'Plot summary' sections. Disambiguation pages are recorded as
        the sentinel string 'ambiguos' in all three lists so indices
        stay aligned. The original built these lists and discarded
        them (no return); returning them is backward-compatible.
    """
    intro = []
    plot = []
    plot_s = []
    import wikipedia
    # NOTE(review): iterates len(movies) but indexes title[i] and reports
    # progress against len(title) — assumes both sequences have equal
    # length; confirm with callers.
    for i in range(len(movies)):
        try:
            intro.append(wikipedia.WikipediaPage(title=title[i]).summary)
            plot.append(wikipedia.WikipediaPage(title[i]).section('Plot'))
            plot_s.append(
                wikipedia.WikipediaPage(title[i]).section('Plot summary'))
            print("Current progress", np.round(i / len(title) * 100, 2), "%")
            print(datetime.datetime.now().time())
        except wikipedia.DisambiguationError:
            # Disambiguation page: append sentinel to keep lists aligned.
            intro.append('ambiguos')
            plot.append('ambiguos')
            plot_s.append('ambiguos')
    return intro, plot, plot_s
def getDOB(title):
    """Return the date-of-birth string scraped from a person's page.

    Looks for the microformat span with class "bday"; if absent, falls
    back to the combined "dtstart bday" class used on some infoboxes.
    """
    DOB = ''
    html = BeautifulSoup(wikipedia.WikipediaPage(title).html(), 'html.parser')
    try:
        DOB = html.find('span', class_="bday").string
    except AttributeError:
        # Narrowed from a bare `except:`: find() returning None is the only
        # failure here (None.string -> AttributeError).
        DOB = html.find('span', class_="dtstart bday").string
    return DOB
def grab_content(page_id, clean=True):
    """Return the article content for *page_id*, '' on any fetch error.

    Parameters
    ----------
    page_id : int or str
        Wikipedia page id passed to WikipediaPage(pageid=...).
    clean : bool
        When True (default), pass the content through `cleaner` first.
    """
    try:
        page_content = wikipedia.WikipediaPage(pageid=page_id).content
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any fetch failure yields empty content.
        page_content = ''
    if clean:
        return cleaner(page_content)
    return page_content
def get_title(title):
    """Resolve *title* to a WikipediaPage, or None if lookup fails.

    Prefers the search suggestion over the literal top result.
    """
    try:
        results, suggestion = wikipedia.search(title, results=1,
                                               suggestion=True)
        title = suggestion or results[0]
        return wikipedia.WikipediaPage(title)
    except Exception:
        # Narrowed from a bare `except:`; covers IndexError (no results)
        # and wikipedia's own errors while letting Ctrl-C propagate.
        return None
async def wiki(query):
    'wikipedia'
    # "-de some query" -> set language to "de", then search "some query".
    if query.startswith('-'):
        lang, search = query.split(None, 1)
        lang = lang[1:]
        try:
            wikipedia.set_lang(lang)
            out_message = wikipedia.WikipediaPage(
                wikipedia.search(search)[0]).content
        except wikipedia.DisambiguationError as e:
            out_message = str(e)
    else:
        # BUG FIX: the original ran this second lookup unconditionally,
        # clobbering the result of the language-prefixed branch above
        # (and re-searching the raw "-lang ..." string). Made it the
        # else-branch so each query is resolved exactly once.
        try:
            out_message = wikipedia.WikipediaPage(
                wikipedia.search(query)[0]).content
        except wikipedia.DisambiguationError as e:
            out_message = str(e)
    return out_message
def get_tfidf_doc(docTitle):
    """Rank related titles by tf-idf and collect each page's link list.

    Returns
    -------
    tuple
        (sortedTitles, documents) where sortedTitles is the tf-idf
        ranking (top 20) and documents holds, per title, the list of
        outgoing links of that page.
    """
    sortedTitles = tfidf.tfidf(docTitle, nTitles=20)
    documents = [wikipedia.WikipediaPage(entry[0]).links
                 for entry in sortedTitles]
    return sortedTitles, documents
def wiki(celestial_object):
    """Print a banner, display stats and a Wikipedia summary for a
    classified celestial object.

    Stats come from 'display_info.yml' in the current working directory;
    the Wikipedia page title is derived from the object class (e.g.
    "spiral galaxy", "mars (planet)"). Always returns the string " ".
    """
    ans = celestial_object
    cwd = os.getcwd()
    # Load per-object display statistics from the YAML next to the script.
    with open(os.path.join(cwd, 'display_info.yml'), 'r') as stream:
        all_display_statistics = load(stream, Loader=SafeLoader)
    req_statistics = all_display_statistics.get(ans, {})
    if ans in ["spiral", "elliptical"]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is {} Galaxy : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        # print(wikipedia.summary("Spiral Galaxy", sentences=2))
        print(wikipedia.WikipediaPage(title='{} galaxy'.format(ans)).summary)
    elif ans in [
            'mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn',
            'uranus', 'neptune'
    ]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is {} Planet : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        # Render "-- parameter: value" lines from the YAML stats.
        statistics = "\n".join([
            '-- {}: {}'.format(parameter, value)
            for parameter, value in req_statistics.items()
        ])
        print("{}\n\n".format(statistics))
        # print(wikipedia.summary("Mercury (planet)", sentences=2))
        # "(planet)" suffix disambiguates e.g. Mercury the element.
        print(wikipedia.WikipediaPage(title='{} (planet)'.format(ans)).summary)
    elif ans in [
            'moon', 'stars', 'nebula', 'supernova', 'cluster_of_galaxies'
    ]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is the {} : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        statistics = "\n".join([
            '-- {}: {}'.format(parameter, value)
            for parameter, value in req_statistics.items()
        ])
        print("{}\n\n".format(statistics))
        print(wikipedia.WikipediaPage(title='{}'.format(ans)).summary)
    return " "
def main():
    """Find definition sentences for a key concept.

    First classifies the locally stored sentences with a pre-trained
    model; if none is a definition, falls back to classifying sentences
    from the concept's Wikipedia summary. Prints what it finds.
    """
    wiki_article_sentences = {}
    selected_key_concept = "Natural language processing"
    wiki_article_sentences[selected_key_concept] = [
        'Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.',
        ' Natural language processing is used for various purposes.'
    ]
    sentences = wiki_article_sentences[selected_key_concept]
    dir = os.path.join(folderpath, "Definition/Model")
    # Matches a non-nested parenthesized group, e.g. "(NLP)".
    p = re.compile('\([^()]*\)')
    test_obj = testing(dir)
    test_obj.load_model()
    check_def_in_text = False
    defnitions = []
    for sent in sentences:
        orig_sent = sent
        if 10 < len(
                sent
        ):  # To remove sentences with single word or very few words or which are empty
            # Normalize: drop parentheticals, non-ASCII, brackets; turn
            # semicolons into commas before classification.
            processed_1 = p.sub('', sent)
            processed_2 = remove_non_ascii(processed_1)
            processed_3 = processed_2.replace('[', '').replace(']', '').replace(
                ';', ',')
            result = test_obj.predic_classes([processed_3])
            # Class 1 = "this sentence is a definition".
            if result[0][0] == 1:
                defnitions.append(orig_sent)
                check_def_in_text = True
                print("Definition found")
    check_def_in_wiki = False
    if check_def_in_text == False:
        # Fallback: classify sentences from the Wikipedia summary instead.
        check_def_in_wiki = False
        defnitions = []
        obj = wikipedia.WikipediaPage(selected_key_concept)
        summarised_text = obj.summary
        wikipedia_sentences = nltk.sent_tokenize(summarised_text)
        for sent in wikipedia_sentences:
            orig_sent = sent
            if 10 < len(
                    sent
            ):  # To remove sentences with single word or very few words or which are empty
                processed_1 = p.sub('', sent)
                processed_2 = remove_non_ascii(processed_1)
                processed_3 = processed_2.replace('[', '').replace(
                    ']', '').replace(';', ',')
                result = test_obj.predic_classes([processed_3])
                if result[0][0] == 1:
                    definition_found = orig_sent
                    check_def_in_wiki = True
                    defnitions.append(orig_sent)
    print(defnitions)
def get_summary(self, queries, prev_summary=None, prev_failed=None):
    """Fetch Wikipedia page content ("summary") for each query.

    Parameters
    ----------
    queries : list of str
        Page titles to fetch.
    prev_summary, prev_failed : str, optional
        Paths to TSV files from a previous run; when both are given,
        already-known queries are skipped and their results reused,
        and queries no longer requested are dropped.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        (summaries with columns ['query', 'summary'], failed with
        column ['query']).
    """
    if prev_summary and prev_failed:
        log.info('Reusing previous summaries.')
        pd_prev_summaries = pd.read_csv(prev_summary, sep='\t',
                                        keep_default_na=False)
        pd_prev_failed = pd.read_csv(prev_failed, sep='\t',
                                     keep_default_na=False)
        known_successful_queries = pd_prev_summaries['query'].tolist()
        known_failed_queries = pd_prev_failed['query'].tolist()
        known_queries = known_successful_queries + known_failed_queries
        depracated_queries = [q for q in known_queries if q not in queries]
        queries = [q for q in queries if q not in known_queries]
        log.info('Found %d deprecated queries', len(depracated_queries))
        log.info('Found %d new queries', len(queries))
    summaries = []
    failed_queries = []
    num_queries = len(queries)
    # Log roughly NUM_LOGS evenly spaced progress messages.
    log_every = [i * int(num_queries / NUM_LOGS) for i in range(NUM_LOGS)]
    for idx, query in enumerate(queries):
        if idx in log_every:
            log.info('Processing query {}/{}'.format(idx, num_queries))
        try:
            summary = wikipedia.WikipediaPage(query).content.replace(
                '\n', ' ')
            summaries.append([query, summary])
        except Exception:
            # Narrowed from a bare `except:`; any fetch failure is
            # recorded so it is not retried on the next run.
            failed_queries.append([query])
    pd_summaries = pd.DataFrame(summaries, columns=['query', 'summary'])
    pd_failed = pd.DataFrame(failed_queries, columns=['query'])
    if prev_summary and prev_failed:
        # DataFrame.append was deprecated and removed in pandas 2.0;
        # pd.concat is the supported equivalent.
        pd_summaries = pd.concat([pd_summaries, pd_prev_summaries])
        pd_failed = pd.concat([pd_failed, pd_prev_failed])
        pd_summaries = pd_summaries[~pd_summaries['query'].
                                    isin(depracated_queries)]
        pd_failed = pd_failed[~pd_failed['query'].isin(depracated_queries)]
    log.info('Successfully got wikipedia summaries for %d queries',
             pd_summaries.shape[0])
    log.info('Failed to get wikipedia summaries for %d queries',
             pd_failed.shape[0])
    return pd_summaries, pd_failed
def wiki_summary(text):
    """Google "<text> wikipedia", resolve the top hit to a page title and
    return (and print) that page's Wikipedia summary.
    """
    text = text + ' wikipedia'
    results = google_search(text)
    # The page title is the last path segment of the first result URL,
    # with underscores turned back into spaces.
    search_query = results[0].split('/')[-1].replace('_', ' ')
    summary = wikipedia.WikipediaPage(title=search_query).summary
    print(summary)
    return summary
def main():
    # Score a Wikipedia page against a tuple dictionary built from
    # argv[1]/argv[2]; argv[3] is the page title. Prints the normalized
    # score. Python 2 code (print statement, bytes/str `in` test).
    tupledict, tupletotal = get_tuples(sys.argv[1], int(sys.argv[2]))
    page = sys.argv[3]
    # xmlcharrefreplace keeps non-ASCII content as &#NNN; references.
    text = wikipedia.WikipediaPage(page).content.encode('ascii',
                                                        'xmlcharrefreplace')
    score = 0.0
    # Sum the weights of every tuple that appears verbatim in the text.
    for tupletext in tupledict:
        if tupletext in text:
            score = score + tupledict[tupletext]
    score = (score / tupletotal)
    print "score=%f page=%s" % (score, page)
def suggest(request):
    """Django view: render the Wikipedia article named by the ?q= param."""
    title = request.GET['q']
    article_html = wiki.WikipediaPage(title=title).html()
    context = {
        "article": {
            "title": title,
            "html_content": article_html,
        }
    }
    return render(request, template_name="wikipage.html", context=context)
def get_links(self, wiki_page):
    """Return the outgoing links of *wiki_page*, visiting each page once.

    Records the page title in ``self.tested_pages``; raises IOError if
    the page was already processed.
    """
    page_title = wiki_page.title()
    if page_title in self.tested_pages:
        raise IOError('Link already found')
    self.tested_pages.add(page_title)
    return wikipedia.WikipediaPage(page_title).links
def scrape_data(topics):
    """Extract nouns from each topic's page and from its linked pages.

    BUG FIX: the original only printed DisambiguationError and then fell
    through to the next line, where `content` / `external_links` /
    `link_content` were unbound (NameError), or stale from a previous
    iteration. Each failed fetch now skips the topic/link instead.
    """
    for topic in topics:
        topic_to_noun[topic] = {}
        try:
            content = wikipedia.WikipediaPage(topic).content
        except wikipedia.exceptions.DisambiguationError as e:
            print("Error: {0}".format(e))
            continue
        extract_nouns(content, topic)
        try:
            external_links = wikipedia.WikipediaPage(topic).links
        except wikipedia.exceptions.DisambiguationError as e:
            print("Error: {0}".format(e))
            continue
        for links in external_links:
            # Follow the links and extract content from each linked page.
            try:
                link_content = wikipedia.WikipediaPage(links).content
            except wikipedia.exceptions.DisambiguationError as e:
                print("Error: {0}".format(e))
                continue
            extract_nouns(link_content, topic)
        print("--- Finished scraping topic! " + str(topic) + " ----")
def get_historic_events():
    """Return three random, short "on this day" events as a JSON response.

    Scrapes the Wikipedia day page (e.g. "March 5") and samples three
    events under 108 characters (so they fit the display).
    """
    now = datetime.now()
    day_title = now.strftime('%B') + ' ' + str(now.day)
    page = wikipedia.WikipediaPage(day_title)
    soup = BeautifulSoup(page.html(), features="html.parser")
    items = soup.find("span", id="Events").find_next("ul").find_all("li")
    texts = [item.text for item in items]
    # limit event text length bc/ screen size
    short_events = [t for t in texts if len(t) < 108]
    events = sample(short_events, 3)
    return jsonify({'events': events}), 200
def generate_dictionary(tag, max_word_length):
    """Build a word list from the configured Wikipedia topics for *tag*.

    Sets the wiki language to *tag*, fetches each configured topic's
    content, transliterates it to ASCII and runs `process` on it.

    NOTE(review): `lst` is overwritten on every iteration, so only the
    LAST topic's words are returned — the loop suggests accumulation was
    intended; confirm. Also raises NameError if the topic list is empty.
    """
    wiki.set_lang(tag)
    for topic in config_2.language_tags[tag]:
        page = wiki.WikipediaPage(topic)
        content = page.content
        # Strip accents/diacritics so only ASCII characters remain.
        content = unidecode(content)
        lst = process(content, max_word_length)
    return lst
def makeSuits(self):
    """Build the four Minor Arcana suits from the page's card images.

    Each suit pairs the image URLs whose filename contains the marker
    with the module-level `names` sequence, in the original order:
    Wands, Coins (from 'Pents' images), Cups, Swords.
    """
    urls = wikipedia.WikipediaPage('Minor Arcana').images
    # (substring to match in the image URL, display name of the suit)
    suit_specs = (('Wands', 'Wands'), ('Pents', 'Coins'),
                  ('Cups', 'Cups'), ('Swords', 'Swords'))
    suits = []
    for marker, suit_name in suit_specs:
        matching = [url for url in urls if url.rfind(marker) > 0]
        suits.append(self.makeName(list(zip(matching, names)), suit_name))
    return suits
def viki_wikifull_page(self, query=None):
    # Print the image URLs of the page titled *query*. Python 2 code
    # (print statement, u'' literal). On any failure returns a JSON
    # error string; on success returns None (output is print-only).
    try:
        data = wikipedia.WikipediaPage(title=query,
                                       pageid=None,
                                       redirect=True,
                                       preload=False,
                                       original_title=u'')
        print data.images
    except Exception as e:
        return json.dumps({'error': str(e)})
def auto_name(name):
    """Return the first of up to 3 search results for *name* that resolves
    to a real page; fall back to *name* itself when nothing resolves.
    """
    try:
        for candidate in wikipedia.search(name, results=3):
            try:
                # Raises if the candidate title has no actual page.
                wikipedia.WikipediaPage(candidate)
                return candidate
            except Exception:
                # Narrowed from bare `except:`; try the next result.
                pass
    except Exception:
        return name
    # BUG FIX: the original fell off the end and returned None when the
    # search succeeded but no candidate resolved; the outer handler shows
    # the intended fallback is the input name.
    return name
def get_paragraphs(question: str, answers: list):
    """Fetch the content of the top-10 search hits for *question* and pass
    them, with *answers*, to get_five_paragraphs. Returns None when the
    search yields nothing.
    """
    search_results = wikipedia.search(question, results=10)
    # `wikipedia.search` returns a list, never None, so the original
    # `is None` guard was dead code; test for emptiness instead.
    if not search_results:
        print("couldn't find results!")
        return None
    page_list = [
        wikipedia.WikipediaPage(title=target) for target in search_results
    ]
    content_list = [page.content for page in page_list]
    return get_five_paragraphs(content_list, answers)
def getArticle(articleName):
    """Return the WikipediaPage for *articleName*, or None when it cannot
    be resolved (a message is printed in that case).

    BUG FIX: the original did `return articlePage` after the handlers,
    which raised UnboundLocalError whenever an exception had fired;
    returning None makes the failure explicit and checkable by callers.
    """
    try:
        return wikipedia.WikipediaPage(articleName)
    except wikipedia.exceptions.DisambiguationError:
        print('\nDisambiguation Error: Article name is too broad')
    except wikipedia.exceptions.PageError:
        print('Article page not found, try a different name')
    return None
def get_wiki_image_parser(self, search_term, ordinal_num):
    """Return the first image URL on the top search hit whose URL contains
    *ordinal_num*, or "" when no image matches.
    """
    top_hit = wikipedia.search(search_term, results=1)
    wikipedia.set_lang("en")
    page = wikipedia.WikipediaPage(title=top_hit[0])
    matches = (img for img in page.images if ordinal_num in img)
    return next(matches, "")
def lookup_state_wiki(state_name):
    '''
    Finds wikipedia page for given state_name
    Converts html to text
    '''
    markup = wikipedia.WikipediaPage(state_name).html()
    if not markup:
        # Empty HTML — nothing to extract.
        print("State not found")
        return None
    return BeautifulSoup(markup, "html.parser").get_text()