import codecs
import glob
import os
import sys

from wikipediaapi import Wikipedia


def main(args):
    # get_jsonlpage and print_page are helpers defined elsewhere in this project.
    files = glob.glob(args.filepattern)
    id_wiki = Wikipedia(language='id')
    en_wiki = Wikipedia(language='en', extract_format=args.format)
    for corpus in files:
        print(corpus)
        if os.path.isfile(corpus):
            _, fname = os.path.split(corpus)
            if args.output_dir and os.path.isdir(args.output_dir):
                output_file = os.path.join(args.output_dir, fname)
                mode = 'w+'
                print(output_file)
                if os.path.exists(output_file) and args.duplicate_append:
                    print('file exists')
                    mode = 'a'
                fileout = codecs.open(output_file, mode=mode, encoding=args.encoding)
            else:
                fileout = sys.stdout
            data = codecs.open(corpus, mode='r', encoding=args.encoding)
            for title in get_jsonlpage(data):
                # Look the title up on the Indonesian Wikipedia, then follow its
                # language link to the English article and dump that text.
                page = id_wiki.page(title)
                print(title)
                try:
                    # print(page.langlinks)
                    if 'en' in page.langlinks:
                        en_title = page.langlinks['en'].title
                        en_page = en_wiki.page(en_title)
                        print(en_title)
                        # print(en_page.text)
                        en_text = print_page(en_page, args.format)
                        print(en_text, file=fileout)
                except Exception:
                    continue
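# Hedged sketch of CLI wiring for main() above: it only shows an argument parser
# exposing the attributes main() reads (filepattern, format, output_dir,
# duplicate_append, encoding). Flag names and defaults are assumptions, not taken
# from the original script.
if __name__ == '__main__':
    import argparse
    import wikipediaapi

    parser = argparse.ArgumentParser(
        description='Dump the English counterparts of Indonesian Wikipedia titles.')
    parser.add_argument('filepattern', help='glob pattern of corpus files containing page titles')
    parser.add_argument('--format', default=wikipediaapi.ExtractFormat.WIKI,
                        help='extract format forwarded to wikipediaapi and print_page')
    parser.add_argument('--output-dir', dest='output_dir', default=None)
    parser.add_argument('--duplicate-append', dest='duplicate_append', action='store_true')
    parser.add_argument('--encoding', default='utf-8')
    main(parser.parse_args())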
# Assumes the underlying client is imported under an alias so it does not clash
# with this wrapper class, e.g.: from wikipediaapi import Wikipedia as Wiki
class Wikipedia:

    def __init__(self, title):
        self.wiki = Wiki('ru')
        self.title = title

    def page(self):
        page = self.wiki.page(self.title)
        if not page.exists():
            page = self
            setattr(page, 'sections', [])
        return page

    def summary(self):
        page = self.page()
        if page.sections != []:
            return {'Общая информация': page.summary}  # "General information"

    def parse_sections(self, sections, summary=None):
        info = {}
        if summary is not None:
            info.update(summary)
        for section in sections:
            if section.text == '':  # was `is ''`, an identity check rather than equality
                value = self.parse_sections(section.sections)
            else:
                value = section.text
            info[section.title] = value
        return info

    def sections(self):
        return self.parse_sections(self.page().sections, self.summary())
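# Hedged usage sketch for the wrapper above; "Москва" (Moscow) is only an
# illustrative Russian Wikipedia title, and the snippet assumes
# `from wikipediaapi import Wikipedia as Wiki` as noted in the class comment.
article = Wikipedia('Москва')
for title, content in article.sections().items():
    # content is either the section text or a nested dict of sub-sections
    print(title, type(content).__name__)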
import pickle as pkl
from os import makedirs
from pathlib import Path
from typing import Union

from wikipediaapi import Wikipedia


def dump_page(source: str,
              target_folder: Union[Path, str] = "pages",
              wiki_obj: Wikipedia = None,
              lang: str = 'fr'):
    if not wiki_obj:
        wiki_obj = Wikipedia(lang)
    target_folder = Path(target_folder)
    if not target_folder.exists():
        makedirs(target_folder)
    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")
    else:
        # If the canonical title returned by the API differs, fetch that page instead.
        page_info = wiki_obj.info(wikipage)
        if page_info.title != wikipage.title:
            wikipage = wiki_obj.page(page_info.title)
        wiki_title = wikipage.title.replace(' ', '_')
        target_file = target_folder / (wiki_title.replace("/", "__SLASH__") + ".pkl")
        pkl.dump(wikipage, target_file.open('wb'))
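# Hedged usage sketch for dump_page above. The page title and folder name are
# illustrative; whether the pickled WikipediaPage round-trips cleanly depends on
# the installed wikipediaapi version, so this only checks that a file was written.
if __name__ == "__main__":
    dump_page("Tour Eiffel", target_folder="pages", lang="fr")
    for pkl_file in Path("pages").glob("*.pkl"):
        print("dumped:", pkl_file.name)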
from wikipediaapi import Wikipedia


def get_company_details(company):
    """
    Params:
        - company (str)
    Returns:
        - company_description (str)
    """
    wiki_wiki = Wikipedia('en')
    try:
        # Try different lookups for the company until something good is returned.
        page = wiki_wiki.page(company + " (company)")
        if not page.exists():
            page = wiki_wiki.page(company)
    except Exception as err:
        # printColoured and InvalidUserInput are helpers defined elsewhere in this project.
        printColoured(err, colour="red")
        raise InvalidUserInput(
            description="Connection timed out. Please try again later")
    company_data = page.text
    # The first line of the article text serves as the company description.
    company_description = company_data.split("\n")[0]
    return company_description
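# Hedged usage sketch for get_company_details; "Nvidia" is only an illustrative
# query, and error handling assumes the project-specific InvalidUserInput exception.
if __name__ == "__main__":
    try:
        print(get_company_details("Nvidia"))
    except InvalidUserInput as exc:
        print("lookup failed:", exc)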
def main(self):
    from aion_core.utils import remove_brackets, remove_space, remove_string_sequence
    from wikipediaapi import Wikipedia

    # Strip the activation phrase from the recognised speech to get the article name.
    splitted_acph = self.activate_phrase.split("__and__")
    searched_article = remove_string_sequence(self.speech_input, splitted_acph[0], splitted_acph[-1])

    wiki = Wikipedia(aconf.Aion().get_language().split("_")[0])
    article = wiki.page(searched_article)
    if article.exists():
        article_text = remove_brackets(article.summary)
        article_text = remove_space(article_text)
        self.speech_output(alang.start("skills", "wikipedia", {"article_text": article_text}))
    else:
        self.speech_output(alang.start("skills", "wikipedia_article_not_found", {"article_name": searched_article}))
def process_page(
    nlp: spacy.Language,
    category: Text,
    wiki: wikipediaapi.Wikipedia,
    page: Text,
) -> bool:
    """Fetches a single page and creates index files."""
    filename = os.path.join("pages", f"{page}.sentences.json")
    output_filename = filename.replace(".sentences.", ".index.")
    if not os.path.exists(filename):
        article = wiki.page(page)
        summary = wikipediaapi.WikipediaPageSection(wiki=wiki, title='Summary', text=article.summary)
        sections = [summary] + article.sections
        sentences = [
            dict(id=id, **sentence)
            for id, sentence in enumerate(extract_sections(nlp, sections))
        ]
        if any(refer in sentences[0]["line"].lower() for refer in kREFER):
            return False
        with open(filename, 'w') as outfile:
            json.dump(
                {
                    "category": category,
                    "title": page,
                    "sentences": sentences
                },
                outfile,
                indent=2)
        if not os.path.exists(output_filename):
            command = f'node util/single_index.js "{filename}" "{output_filename}"'
            subprocess.call(command, shell=True)
        bucket = fibs_firebase_config.get_bucket()
        blob = bucket.blob(f"pages/{page}.json")
        if not blob.exists():
            blob.upload_from_filename(filename, content_type='application/json')
            bucket.blob(f"indices/{page}.json").upload_from_filename(
                output_filename, content_type='application/json')
        return True
    return False
def birthday_of_rich(id=1):
    '''Parse billionaire birthdays from their Wikipedia pages.'''
    wiki = Wikipedia()
    for id in Rich.select().where(Rich.id >= id):
        # print(id, id.name)
        # Billionaires with no birthday data on their Wikipedia page.
        no_bday = [
            'Qin Yinglin', 'Colin Zheng Huang', 'Zhong Huijuan', 'Walter P.J. Droege',
            'Li Xiting', 'Yang Huiyan', 'Joseph Safra', 'Lukas Walton',
            'Theo Albrecht, Jr.', 'Zhang Yiming', 'Lee Man Tat', 'Wang Wei',
            'Radhakishan Damani', 'Liu Yonghao', 'Wu Yajun', 'Sun Piaoyang',
            'Pang Kang', 'Udo Tschira', 'Xu Hang', 'Pallonji Mistry',
            'Zhang Yong', 'Robert Ng', 'Iris Fontbona', 'Donald Newhouse',
            'Graeme Hart', 'Goh Cheng Liang', 'Hank Meijer', 'Robin Zeng',
            'Andreas Struengmann', 'Thomas Struengmann', 'Hui Wing Mau',
            'Quek Leng Chan', 'Sun Hongbin', 'Zhang Bangxin', 'Lu Zhongfang',
            'Cyrus Poonawalla', 'Scott Farquhar', 'Gong Hongjia', 'Eric Wittouck',
            'Xu Shihui', 'Wang Wenyin', 'Zhang Fan', 'Chen Bang', 'Jiang Rensheng',
            'Blair Parry-Okeden', 'David Duffield', 'Eyal Ofer', 'John Grayken'
        ]
        if id.name in no_bday:
            id.bday = datetime.datetime(1, 1, 1)
            id.save()
            continue
        page_py = wiki.page(id.name)
        link = page_py.fullurl
        response = get_response(link)
        html_doc = BeautifulSoup(response.text, features='html.parser')
        date = html_doc.find('span', {'class': 'bday'})
        if date is None:
            bday = fix_for_data(id.name)
        else:
            bday = datetime.datetime.strptime(date.text, '%Y-%m-%d')
        zodiac = find_zodiac(bday)
        id.bday = bday.date()
        id.zodiac = zodiac
        id.save()
from discord import Color, Embed
from discord.ext import commands
from wikipediaapi import Wikipedia


class Wiki(commands.Cog):
    """Wikipedia page lookup."""

    def __init__(self, bot):
        self.bot = bot
        self.wiki = Wikipedia('en')

    @commands.command(name='wiki', aliases=['page'])
    async def page(self, ctx, *, search):
        result = self.wiki.page(search)
        if not result.exists():
            return await ctx.send(embed=Embed(
                title="Page not found ⛔",
                description=f"No page was found under the name `{search}`",
                color=Color.blurple()))
        wiki = Embed(color=Color.dark_gold())
        wiki.title = result.title
        wiki.url = result.fullurl
        wiki.description = f'{result.text[0:500]}...'
        wiki.set_footer(
            text="Powered by Wikipedia",
            icon_url="https://i.ibb.co/jyX08CD/wikipedia-PNG39.png")
        wiki.timestamp = ctx.message.created_at
        return await ctx.send(embed=wiki)
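# Hedged sketch of registering this cog with a discord.py bot: 2.x expects an
# async setup hook, while 1.x used a plain `def setup(bot): bot.add_cog(Wiki(bot))`.
async def setup(bot):
    await bot.add_cog(Wiki(bot))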
from wikipediaapi import Wikipedia


class Post:

    def __init__(self, team_names='', team_colors='', team1_players='', team2_players='',
                 options='', header='', question='', twTags=''):
        if team_names == '' or team_colors == '' or team1_players == '' or team2_players == '':
            return
        if options == '' or header == '' or question == '':
            return
        self.listitems = list(options)
        self.header = str(header)
        self.question = str(question)
        self.wiki = Wikipedia('en')
        self.team_names = list(team_names)
        self.team_colors = list(team_colors)
        self.team1_players = list(team1_players)
        self.team2_players = list(team2_players)
        self.twTags = list(twTags)

    def getWikiUrl(self, player_name=''):
        if player_name == '':
            return None
        page_py = self.wiki.page(player_name)
        if page_py.exists() is False:
            return None
        return page_py.fullurl

    @staticmethod
    def tag(name, *content, style=None, href=None, **attrs):
        if style is not None:
            attrs['style'] = style
        if href is not None:
            attrs['href'] = href
        if attrs:
            attr_str = ''.join(' %s="%s"' % (attr, value)
                               for attr, value in sorted(attrs.items()))
        else:
            attr_str = ""
        if content:
            return '\n'.join('<%s%s>%s</%s>' % (name, attr_str, c, name) for c in content)
        else:
            return '<%s%s />' % (name, attr_str)

    def formatApi(self):
        http_part = "http --auth : --form POST http://www.tactification.com/api_rt/v1.0/new_post "
        question_tag = self.tag('div', self.question, style='color:black')
        br1 = self.tag('br')
        li_items = str()
        for item in self.listitems:
            li_items += ''.join(self.tag('li', item))
        ul = self.tag('ul', li_items)
        div1 = self.tag('div', ul, style='color:black')
        starring_tag = self.tag('div', "Starring:", style='color:black')

        team1_url = self.getWikiUrl(self.team_names[0])
        if team1_url is None:
            print(self.team_names[0])
            return
        a_team1 = self.tag('a', self.team_names[0], href=team1_url) + ': '
        a_items = str()
        for item in self.team1_players:
            print(item)
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return
            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')
        a_items = a_items.rstrip(',')  # rstrip returns a new string; the original discarded it
        i_team1 = self.tag('i', a_team1 + a_items, style="color:" + str(self.team_colors[0]))

        team2_url = self.getWikiUrl(self.team_names[1])
        if team2_url is None:
            print(self.team_names[1])
            return
        a_team2 = self.tag('a', self.team_names[1], href=team2_url) + ': '
        a_items = str()
        for item in self.team2_players:
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return
            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')
        a_items = a_items.rstrip(',')
        i_team2 = self.tag('i', a_team2 + a_items, style="color:" + str(self.team_colors[1]))

        header = " header={!r} ".format(self.header)
        twTag = (" twTags='#{}, #{}, #{}' ".format(*self.twTags))
        end_part = ("tactical_gif@home_img.jpg tactical_pic_1750@with_help_msg.jpg "
                    "tactical_pic_1575@with_help_msg_75.jpg tactical_pic_875@with_help_msg_50.jpg")
        final_command = (http_part + "body='" + question_tag + br1 + div1 + starring_tag +
                         i_team1 + br1 + i_team2 + "'" + header + twTag + end_part)
        print(final_command)
"Л": 0, "М": 0, "Н": 0, "О": 0, "П": 0, "Р": 0, "С": 0, "Т": 0, "У": 0, "Ф": 0, "Х": 0, "Ц": 0, "Ч": 0, "Ш": 0, "Щ": 0, "Э": 0, "Ю": 0, "Я": 0 } for name in names_list: alphabet_dict[name[:1].upper()] += 1 return alphabet_dict wiki_wiki = Wikipedia('ru') members = wiki_wiki.page('Категория:Животные по алфавиту').categorymembers animal_names = get_russian_category_members(members) counted_names = count_names(animal_names) for key, value in counted_names.items(): print(f'{key}: {value}')
from wikipediaapi import Wikipedia

player_name = input("Enter player name: ")
wiki = Wikipedia(language='en')
page = wiki.page(player_name)
for s in page.sections:
    if 'Club career' in s.title:
        print(s.title)
        for p in s.sections:
            print("\t" + p.title)
            if p.sections:
                for q in p.sections:
                    print("\t\t" + q.title)