async def moegirl_search(q):
    """Search zh.moegirl.org for *q* and return the top hit's summary.

    Returns False when the search produces no results.
    """
    moegirlwiki = MediaWiki(url='http://zh.moegirl.org/api.php')
    hits = moegirlwiki.search(q)
    if not hits:
        return False
    return moegirlwiki.page(hits[0]).summary
def wikipedia_summary(topic, lang='en'):
    """Return a Markdown-formatted summary for *topic* from Wikipedia.

    :param topic: search phrase to look up
    :param lang: Wikipedia language code (default ``'en'``)
    :return: Markdown string with title, summary and a "Read more" link
    """
    wikipedia = MediaWiki(lang=lang)
    search = wikipedia.search(topic)
    # BUG fix: the original referenced `page.title` / `page.url` without ever
    # defining `page`, which raised NameError on every call.
    page = wikipedia.page(search[0])
    summary = wikipedia.summary(search[0])
    text = '**{}**\n\n{}\n**Read more at:** [{}]({})'.format(
        page.title, summary, page.title, page.url)
    return text
def apiWikipedia(search, language):
    """Return ``(summary, url)`` of the Wikipedia page for *search*.

    :param search: page title / search phrase
    :param language: Wikipedia language code ('pt' is mapped to 'pt-br')
    :raises Exception: when the search yields no results
    """
    print(language, search)  # debug trace kept from the original
    if language == 'pt':
        # MediaWiki uses the regional code for Brazilian Portuguese.
        language = 'pt-br'
    wikipedia = MediaWiki(lang=language)
    # PERF fix: the original ran the search and threw the results away,
    # paying for a network round-trip just to test emptiness.
    results = wikipedia.search(search)
    if len(results) < 1:
        raise Exception('apiWikipedia: Content not found')
    page = wikipedia.page(search)
    return page.summary, page.url
def main(search_term):
    """Search the Papiamento Wikipedia for *search_term* and cross-reference
    each hit on Wikidata, reporting Papiamento/Dutch label coverage.

    :param search_term: phrase to look up on pap.wikipedia.org
    """
    wikipedia = MediaWiki(lang='pap', user_agent='code-for-nl-pap-parser')
    wikidata = MediaWiki(url='https://www.wikidata.org/w/api.php',
                         user_agent='code-for-nl-pap-parser')
    search_result = wikipedia.search(search_term, results=4)
    for result_item in search_result:
        page = wikipedia.page(result_item)
        print(
            'I found page \'%s\' for term \'%s\'' % (result_item, search_term),
            'with categories',
            '/'.join(page.categories),
            'https://pap.wikipedia.org/wiki/' + urllib.parse.quote(result_item))
        # print(page.images)
        # Now I am going to search this one on wikidata, this will return a code. like Q215887
        search_data = wikidata.search(result_item, results=1)
        for data_item in search_data:
            Q_CODE = data_item
            print(result_item, 'is known on wikidata with the code', Q_CODE,
                  'https://www.wikidata.org/wiki/' + Q_CODE)
            # Now try the qwikidata interface
            entity = get_entity_dict_from_api(Q_CODE)
            q = WikidataItem(entity)
            pap_data_label = q.get_label(lang='pap')
            nl_data_label = q.get_label(lang='nl')
            if pap_data_label and nl_data_label:
                # First get the page. Read the images found
                data_page = wikidata.page(result_item)
                # print(data_page.images)
                print(pap_data_label, 'is called', nl_data_label, 'in dutch')
            elif pap_data_label:
                print(pap_data_label, 'has no entry for dutch!')
            elif nl_data_label:
                print(Q_CODE, 'does not match papiamentu entry')
            else:
                # BUG fix: the original printed pap_data_label here, which is
                # known to be empty in this branch; Q_CODE is the only
                # meaningful identifier for the message.
                print(Q_CODE, 'has no entry for dutch or papiamentu!')
def get_wikipedia_article(s_word):
    """Fetch the best-matching Wikipedia article for *s_word*.

    On a disambiguation page, retry with a randomly chosen option.
    Returns the article page object, or False on any failure.
    """
    try:
        wikipedia = MediaWiki(url=wikiurl)
        wp_words = wikipedia.search(s_word, results=1)
        wp_article = wikipedia.page(wp_words[0])
        return wp_article
    except DisambiguationError as e:
        # BUG fix: in the original, an exception raised by this retry
        # (e.g. a second DisambiguationError) escaped uncaught instead of
        # being logged and turned into the documented False return.
        try:
            return wikipedia.page(random.choice(e.options))
        except Exception as retry_err:
            app.logger.info('Exception')
            app.logger.info(retry_err)
            return False
    except Exception as e:
        app.logger.info('Exception')
        app.logger.info(e)
        return False
class WikiMedia:
    """Thin wrapper around the French Wikipedia API."""

    def __init__(self):
        self.wikipedia = MediaWiki()
        self.wikipedia.language = "fr"

    def _fetch(self, title):
        """Fetch page url + 3-sentence summary for *title*.

        Strips ``== section ==`` headings from the summary.
        May raise mediawiki.exceptions.DisambiguationError.
        :return: tuple ``(summary, url, True)``
        """
        infos = self.wikipedia.page(title)
        summary = self.wikipedia.summary(title, sentences=3)
        # Remove == string == section markers from the summary.
        summary = re.sub(r"={2}\s.+={2}", r"", summary)
        return summary, infos.url, True

    def get_infos(self, query):
        """Method allowing to retrieve informations from wikipedia.fr.

        :param query: search phrase
        :return: dict with keys ``summary``, ``url`` and ``status``
        """
        # Defaults cover both "no titles found" and failed lookups.
        summary, url, status = "", "", False
        try:
            titles = self.wikipedia.search(query)
            if len(titles) > 0:
                summary, url, status = self._fetch(titles[0])
            # else: empty results — keep the falsy defaults.
        # One except block handles disambiguation errors: if the first
        # title is ambiguous, fall back to the second one (if any).
        except mediawiki.exceptions.DisambiguationError:
            if len(titles) > 1:
                try:
                    summary, url, status = self._fetch(titles[1])
                except mediawiki.exceptions.DisambiguationError:
                    summary, url, status = "", "", False
                    logging.exception("Exception occurred")
            else:
                summary, url, status = "", "", False
                logging.exception("Exception occurred")
        return {"summary": summary, "url": url, "status": status}
# NOTE(review): this chunk starts mid-loop — `x`, `sections`, `wikiPage`,
# `bannedSections`, `keyword` and `wikipedia_mediawiki` are defined earlier
# in the file.
if not is_not_blank(wikiPage.section(sections[x])):
    sections[x] = None
if sections[x] in bannedSections:
    sections[x] = None
# if "\u0x8211" in sections[x]:  # trying to remove the sections with the - not recognised by system (e.g illinois state senator (1997–2004))
#     sections[x] = None
sections = filter(None, sections)  # removing the empty sections
actualSection = copy.copy(sections)  # make a shallow copy of the list to have the case sensitive sections list
# NOTE(review): under Python 3, filter() returns an iterator, so
# len(sections) below would raise TypeError — this code appears to target
# Python 2, or needs list(filter(...)). TODO confirm.
for x in range(len(sections)):
    sections[x] = sections[x].lower()  # set the first letter lower case to being user-friendlier :) (actually for speech to text)
    sections[x] = unidecode(sections[x])  # remove accent if there is one

# Suggestions
suggestions = wikipedia_mediawiki.search(keyword, 5, False)  # get the related topic
# Check that the suggested pages exist, if not, remove the suggestion from the list
for x in range(len(suggestions)):
    try:
        suggestedPage = wikipedia_mediawiki.page(suggestions[x])  # search on wikipedia the suggestion's page
    except:
        suggestions[x] = None
suggestions = filter(None, suggestions)  # removing the empty suggestions
for x in range(len(suggestions)):
    suggestions[x] = suggestions[x].lower()  # set the first letter lower case to being user-friendlier :) (actually for speech to text)
    suggestions[x] = unidecode(suggestions[x])  # remove accent if there is one

# Content
content = wikiPage.summarize(sentences=3)  # get the summary of the wikipedia page
# Если в данных уже есть нужный тег if "wikipedia" in res["tags"]: search_page = wikipedia.page(res["tags"]["wikipedia"][3:]) else: # Отлов ошибок, если в запросе нет имени try: # Поиск по координатам page_names = wikipedia.geosearch(latitude=res["lat"], longitude=res["lon"]) page_name = [ name for name in page_names if check_levenshtein(name, res["tags"].get("name")) ] if not page_name and res["tags"].get("name"): page_names = wikipedia.search(res["tags"].get("name")) page_name = [ name for name in page_names if check_levenshtein(name, res["tags"].get("name")) ] if page_name: search_page = wikipedia.page(page_name[0]) except TypeError: continue if search_page: # Дополняем данные из первого задания page_dict = { "summary": search_page.summary,
p = wikipedia.page(geo_res) data_found = True break # Проверим по геоположению и полю alt_name if not data_found and "alt_name" in value['tags']: for geo_res in geo: if Levenshtein.distance(geo_res, value['tags']['alt_name']) <= 3: p = wikipedia.page(geo_res) data_found = True break # Предпоследняя попытка. Поиск по имени if not data_found: if "name" in value['tags']: name_search = wikipedia.search(value['tags']['name']) for name_res in name_search: if Levenshtein.distance(geo_res, value['tags']['name']) <= 3: p = wikipedia.page(name_res) data_found = True break # Последняя попытка. Поиск по альтернативному имени if not data_found: if "alt_name" in value['tags']: name_search = wikipedia.search(value['tags']['alt_name']) for name_res in name_search: if Levenshtein.distance(
def get_search(search_phrase):
    """Run a Wikipedia search for *search_phrase* and return the raw hits."""
    client = MediaWiki(user_agent='chucha-user-agent-string')
    return client.search(search_phrase)
class DialogueManager:
    """ Simple Question Answering Dialogue Manager """

    def __init__(self, log_path: str, base_model: str) -> None:
        # Wikipedia client used to fetch passages as QA context.
        self._wiki = MediaWiki()
        # Intent + slot tagger; log_path points at its artifacts.
        self._entity_recognizer = TFLiteNLU(log_path)
        # HuggingFace tokenizer/model pair for extractive QA.
        self._tokenizer = AutoTokenizer.from_pretrained(base_model)
        self._answerer = TFAutoModelForQuestionAnswering.from_pretrained(
            base_model)

    def __call__(self, utterance: str) -> str:
        # Route the utterance to a handler based on the recognized intent.
        result = self._entity_recognizer(utterance)
        if result.intent == "ask.question":
            return self._answer(result)
        elif result.intent == "greet":
            return self.greet()
        elif result.intent == "command.exit":
            return self.exit()
        elif result.intent == "request.help":
            return self.help()
        else:
            return self.fallback()

    def _answer(self, result: Result) -> str:
        # Answer a question by retrieving a Wikipedia passage and running
        # extractive QA over it; falls through to a canned reply when the
        # NLU produced no slots.
        if result.slots:
            # get the tagged entity for page search
            entity = result.slots.get("entity").get("raw_value")
            # perform the search to find the wikipedia page
            entity = self._wiki.search(entity)[0]
            # get the page content to feed as context to the qa model
            passage = self._wiki.page(entity, auto_suggest=False).content
            # prepare qa model inputs
            inputs = self._tokenizer(
                result.utterance,
                passage,
                return_tensors="tf",
                padding=True,
                truncation=True,
            )
            # compute answer span
            # NOTE(review): unpacking two tensors assumes the model returns a
            # (start_logits, end_logits) tuple; recent transformers versions
            # return a ModelOutput object instead — TODO confirm version.
            start_scores, end_scores = self._answerer(inputs)
            start, end = tf.argmax(start_scores, -1)[0], tf.argmax(end_scores,
                                                                   -1)[0]
            # prepare the passage ids for slicing
            tokens = self._tokenizer.convert_ids_to_tokens(
                (inputs["input_ids"].numpy()[0]))
            # retrieve only the answer from the passage
            answer = self._tokenizer.convert_tokens_to_string(
                tokens[start:end + 1])
            return answer
        return "I don't have an answer for that"

    @staticmethod
    def greet() -> str:
        return "Hello, Ask me anything"

    @staticmethod
    def exit() -> str:
        return "Goodbye"

    @staticmethod
    def fallback() -> str:
        return (
            "I'm having trouble understanding your request, could you please "
            "repeat it")

    @staticmethod
    def help() -> str:
        return "Ask a question like, how long is the amazon river?"
# Export the medline frame and load the labeled fifty-fundamental-herbs list.
medline_clause_extract = medline.to_csv('medline_nih_extract.csv')
fifty_fundamental = pd.read_csv(
    '/Users/gurdit.chahal/Capstone_Data_Mining/w210-herbert/data_sources/fifty_fundamental_herbs_labeled.csv'
)
from mediawiki import MediaWiki
wikipediamw = MediaWiki()
#wikipedia.page(wikipedia.search('Agastache rugosa')[0])
# Maps (herb name, section title) -> accumulated section text.
ff_dict = defaultdict(lambda: '')
for name in fifty_fundamental.Scientific_Name:
    try:
        wikipagemw = wikipediamw.page(name)
    # NOTE(review): PageError is defined in mediawiki.exceptions, not as an
    # attribute of the MediaWiki instance — `wikipediamw.PageError` likely
    # raises AttributeError instead of catching the intended error. TODO confirm.
    except wikipediamw.PageError:
        print(name)
        # Fall back to the first search hit when the exact title is missing.
        search = wikipediamw.search(name)
        if search:
            wikipage = wikipediamw.page(search[0])
        else:
            continue  #continue forces to loop to next iteration whereas pass goes to rest of loop
    print("content for: " + name)
    toc = get_toc_mw(
        wikipagemw
    )  #transform ordered dictionairy of sections and subsections to tuples of sections and subsections
    for section in toc:
        ff_dict[name, section[0]] += wiki_topic_text(wikipagemw, section) + ' \n '
# Second pass over the labeled rows (loop body continues past this chunk).
ff_kept = defaultdict(lambda: '')
for row in fifty_fundamental.itertuples(index=True, name='Pandas'):
    name = getattr(row, 'Scientific_Name')
def getArticle(movie):
    """Return the Wikipedia page for the top search hit matching *movie*."""
    wiki = MediaWiki()
    matches = wiki.search(movie, results=3)
    return wiki.page(matches[0])
from mediawiki import MediaWiki
from PIL import Image, ExifTags
import requests
from os.path import isfile, join
from os import makedirs
import time

# Toggle for displaying downloaded images (unused in this visible chunk).
show_img = False
wikipedia = MediaWiki()
pages = wikipedia.search('hd,i')
# Drop disambiguation entries from the search hits.
pages = [p_name for p_name in pages if not p_name.endswith('(disambiguation)')]
for p_name in pages:
    p = wikipedia.page(p_name)
    images = p.images
    for urlimg in images:
        # Skip vector graphics and audio/video formats PIL cannot open.
        if any([urlimg.lower().endswith(p) for p in ['svg', 'ogg', 'ogv']]):
            continue
        # cannot identify image file <_io.BytesIO object at 0x0000012E68413EB8>
        try:
            filename = urlimg.rsplit('/', 1)[1]
            filename = join(p_name, filename)  # one folder per page title
            makedirs(p_name, exist_ok=True)
            if not isfile(filename):
                response = requests.get(urlimg, stream=True)
                trys = 0
                # NOTE(review): `trys` is never incremented in the visible
                # code, so `trys < 5` never bounds this retry loop here —
                # the increment may occur past the end of this chunk. TODO confirm.
                while response.status_code != 200 and trys < 5:
                    time.sleep(2)
                    response = requests.get(urlimg, stream=True)
                    print("try again", urlimg)
class WikiApi:
    """Facade over the Russian Wikipedia/Wikiquote clients plus raw
    MediaWiki API calls."""

    def __init__(self):
        self.wikipedia = MediaWiki(lang='ru')
        self.wikiquote = CustomWikiEngine(url="https://{lang}.wikiquote.org/w/api.php",
                                          lang='ru')

    def quotes(self, *words):
        """Collect quote titles for each word (2 results per word)."""
        results = []
        for word in words:
            titles = self.wikiquote.quotes(word, results=2)
            results += titles
        return results

    def quote_page(self, title):
        """Fetch a single Wikiquote page; returns {} on any failure."""
        response = {}
        try:
            response = self.wikiquote.page(title=title)
        except Exception as e:
            logging.exception(e)
        return response

    def get_pages_by_categories(self, category, limit=10):
        """Return page titles in *category* via the raw MediaWiki API."""
        # https://en.wikipedia.org/w/api.php?a
        # ction=query&
        # generator=categorymembers&
        # gcmlimit=100&
        # gcmtitle=Category:American%20male%20film%20actors&
        # prop=pageimages&
        # pilimit=100
        S = requests.Session()
        URL = "https://ru.wikipedia.org/w/api.php"
        PARAMS = {
            'action': "query",
            'generator': "categorymembers",
            'gcmtitle': category,
            'gcmlimit': limit,
            'format': "json"
        }
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        titles = []
        if 'query' in DATA and DATA['query'] and DATA['query']['pages']:
            titles = [value['title'] for key, value in DATA['query']['pages'].items()]
        return titles

    def movies(self):
        # https://ru.wikipedia.org/w/api.php?format=xml&action=query&list=embeddedin&einamespace=0&eilimit=500&eititle=Template:Infobox_film
        pass

    def search(self, *words):
        """Wikipedia full-text search, 4 results per word."""
        results = []
        for word in words:
            response = self.wikipedia.search(word, results=4)
            short_descriptions = response
            results += short_descriptions
        return results

    def opensearch(self, *words):
        """Wikipedia opensearch (title/description/url triples) per word."""
        results = []
        for word in words:
            response = self.wikipedia.opensearch(word)
            results += response
        return results

    def parse(self, *pages):
        """Extract the lead summary and (when present) the plot section
        from each page; failures are logged and skipped."""
        results = []
        for page in pages:
            try:
                response = self.wikipedia.page(title=page)
                content = response.content
                # Split on == heading == markers; chunk 0 is the lead summary.
                sections = re.split(r'==.+?==', content)
                if sections:
                    summary = sections[0]
                    results.append(summary)
                section_headers = re.findall(r'== \w+ ==', content)
                # '== Сюжет ==' is the Russian "Plot" heading (runtime string,
                # kept verbatim).
                if '== Сюжет ==' in section_headers:
                    index = section_headers.index('== Сюжет ==') + 1
                    if len(sections) > index:
                        plot = sections[index]
                        results.append(plot)
            except Exception as e:
                logging.error(e)
        return results
class WikiCandidatesSelector:
    """
    Class responsible for model of candidates selection from Wikipedia (Model level one)

    :param logger: logger to use in model
    :param separate: if make separate queries for each found entity
    :param n: number of results return after each query
    """

    def __init__(self, logger=DEFAULT_LOGGER, separate: bool = True, n: int = 3, **kwargs):
        # Optional profiler override; falls back to the default measurer.
        self.profiler = kwargs.get('profiler', DEFAULT_MEASURER)
        self.logger = logger
        # Flair sequence tagger with the fast English NER model.
        self.tagger = SequenceTagger.load('ner-fast')
        self.wikipedia = MediaWiki()
        self.separate = separate
        self.n = n
        self.logger.info("Candidate selector is loaded and ready to use.")

    def get_wiki_candidates_raw(self, query: str) -> List[str]:
        """
        Query Wikipedia with given text query
        :param query: text to query in Wikipedia
        :return list of links found for query
        """
        search_results = self.wikipedia.search(query, results=self.n)
        # Article links use underscores where titles have spaces.
        return [t.replace(' ', '_') for t in search_results]

    def get_entities(self, text: str) -> List[str]:
        """
        Get the list of named entities for given text. COMMENT: We should reinitialize this method for using another NER model
        :param text: str, text used for NER extraction
        :return list of str (entities found in text)
        """
        sentence = Sentence(text)
        self.tagger.predict(sentence)
        entities = []
        for entity in sentence.get_spans('ner'):
            entities.append(entity.text)
        return entities

    def get_wiki_candidates_NER(self, query: str) -> Set[str]:
        """
        Method to get the Wikipedia articles candidates with use of NER model
        :param query: str query claim
        :return set of links found for query
        """
        self.profiler.start_measure_local('NER_model')
        entities = self.get_entities(query)
        self.profiler.finish_measure_local()
        self.profiler.start_measure_local('wiki_search')
        # Always search the full query; entity-based searches are added on top.
        search_results = self.get_wiki_candidates_raw(query)
        if not self.separate:
            # One combined query over all entities.
            search_results_en = self.get_wiki_candidates_raw(
                ' '.join(entities))
        else:
            # One query per entity.
            search_results_en = []
            for e in entities:
                search_results_en += self.get_wiki_candidates_raw(e)
        self.profiler.finish_measure_local()
        return set([t for t in search_results + search_results_en])

    def get_wiki_texts(self, articles_names: Set[str]) -> Dict:
        """
        Method that gets Wikipedia texts for given articles names if exist
        :param articles_names set of names for Wikipedia articles.
        :return the dict with article names as keys and list of related sentences as values
        """
        result = {}
        for name in articles_names:
            try:
                page = self.wikipedia.page(name)
                # Flatten newlines, then naively split into sentences on '. '.
                result[name] = page.summary.replace('\n', ' ').split('. ')
            except Exception as e:
                # Missing pages are logged and skipped, not fatal.
                self.logger.warning(
                    f"[Candidates picker] Page for id {name} is not found.")
        return result

    def get_candidates(self, claim: str) -> Dict:
        """
        The main method of the class that get the Wikipedia texts for related articles for given query
        :param claim: str query claim
        :return the dict with article names as keys and list of related sentences as values
        """
        candidates = self.get_wiki_candidates_NER(claim)
        self.logger.info(
            f"[Candidates picker] Candidates found: {', '.join(candidates)}")
        self.profiler.start_measure_local('wiki_texts')
        texts_dict = self.get_wiki_texts(candidates)
        self.profiler.finish_measure_local()
        return texts_dict
opera_garnier[0]['geometry']['location']['lng']) # search = wikipedia.page('Rue Scribe Paris') # content = search.content opera_garnier_content = "La rue Scribe est une voie du 9e arrondissement de Paris.\n\n\n== Situation et accès ==\nLa rue Scribe est située dans le 9e arrondissement de Paris, elle commence boulevard des Capucines, se développe vers le nord-nord-est, croise la rue Auber, longe l'opéra Garnier et la place Charles-Garnier, rejoint la rue des Mathurins au niveau de la place Diaghilev ; elle est prolongée au-delà du boulevard Haussmann par la rue de Mogador.\nElle fait partie de la re-composition de la Chaussée-d'Antin entreprise au XIXe siècle et qui culmine avec les transformations de Paris sous le Second Empire et la construction de l'opéra Garnier.\nCe site est desservi par les stations de métro Opéra et Chaussée d'Antin - La Fayette et par la station de RER Auber sur la ligne \u2009\u200d.\n\n\n== Origine du nom ==\nLa rue Scribe est nommée d'après l'auteur dramatique Eugène Scribe (1791-1861), en raison de son voisinage avec l'opéra Garnier, par décret du 2 mars 1864.\n\n\n== Histoire ==\nLa rue Scribe a été ouverte par décret du 29 septembre 1860, entre le boulevard des Capucines et la rue des Mathurins. 
La largeur de la partie comprise entre le boulevard des Capucines et la rue Auber a été portée à 22 m ; le décret ne prévoyait que 20 m.\nLa partie comprise entre la rue des Mathurins et le boulevard Haussmann a été incorporée à la place Diaghilev.\n\n\n== Bâtiments remarquables et lieux de mémoire ==\n\nNo 1 : anciens locaux du Jockey Club de Paris.\nNos 2 à 6 : arrière du Grand Hôtel (désormais InterContinental Paris Le Grand), dont l'entrée est située sur la place de l'Opéra ; les façades et toitures font l'objet d'une inscription au titre des monuments historiques depuis le 22 août 1975 ; l'immeuble est en outre assujetti à la servitude d'architecture prévue par le décret du 29 septembre 1860 pour les abords de l'Opéra.\nNos 6, 11, 11 bis, 15 et 17 : les façades et les toitures sur rue des immeubles aux abords de l'Opéra font l'objet d'une inscription au titre des monuments historiques depuis le 30 décembre 1977 ; les immeubles sont en outre assujettis à la servitude d'architecture prévue par le décret du 29 septembre 1860 pour les abords de l'Opéra.\nNo 31 : domicile d'Angelo Mariani.\nLa bouche de métro de la station Opéra dessinée en 1900 par l'architecte Hector Guimard pour la Compagnie du chemin de fer métropolitain de Paris et mise en place en 1904 au coin de la rue Auber, fait l'objet d'une inscription au titre des monuments historiques depuis le 29 mai 1978, ainsi que d'un label « Patrimoine du XXe siècle ».\nLe théâtre national de l'Opéra, construit par Charles Garnier. 
La rue Scribe longe la façade ouest, du côté du pavillon de l’Empereur (appelé après la chute du Second Empire « pavillon du chef de l'État ») et de la bibliothèque-musée de l'Opéra.\n\n\n== Notes et références ==\n\n\n== Annexes ==\n\n\n=== Lien externe ===\n(fr) Nomenclature officielle Portail de Paris Portail de la route" formatted_address = geocode_result[0]['formatted_address'] location = (geocode_result[0]['geometry']['location']['lat'], geocode_result[0]['geometry']['location']['lng']) print(formatted_address) from mediawiki import MediaWiki wikipedia = MediaWiki() wikipedia.language = "fr" search = wikipedia.page('Citée Parad') content = search.content search = wikipedia.search('washington') print(search) # search = wikipedia.opensearch('washington') # print(search) # p = wikipedia.page(search[0]) # print('toto') # p.title # p.summary # p.categories # p.images # p.links