Example No. 1
def main(args):
    files = glob.glob(args.filepattern)
    id_wiki = Wikipedia(language='id')
    en_wiki = Wikipedia(language='en', extract_format=args.format)
    for corpus in files:
        print(corpus)
        if os.path.isfile(corpus):
            _, fname = os.path.split(corpus)
            if args.output_dir and os.path.isdir(args.output_dir):
                output_file = os.path.join(args.output_dir, fname)
                mode = 'w+'
                print(output_file)
                if os.path.exists(output_file) and args.duplicate_append:
                    print('file exists')
                    mode = 'a'
                fileout = codecs.open(output_file, mode=mode, encoding=args.encoding)
            else:
                fileout = sys.stdout
            data = codecs.open(corpus, mode='r', encoding=args.encoding)
            for title in get_jsonlpage(data):
                page = id_wiki.page(title)
                print(title)
                try:
                    # print(page.langlinks)
                    if 'en' in page.langlinks:
                        en_title = page.langlinks['en'].title
                        en_page = en_wiki.page(en_title)
                        print(en_title)
                        # print(en_page.text)
                        en_text = print_page(en_page, args.format)
                        print(en_text, file=fileout)
                except Exception:
                    continue
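
Example No. 1 depends on two helpers that are not shown, get_jsonlpage and print_page. The sketch below is purely an assumption about the contract they would need to satisfy, not the original implementation:

import json

def get_jsonlpage(data):
    # Hypothetical helper: yield one page title per JSON-Lines record
    for line in data:
        record = json.loads(line)
        yield record.get('title', '')

def print_page(page, fmt):
    # Hypothetical helper: return the page body; wikipediaapi exposes it as page.text
    return page.text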
Example No. 2
class Wikipedia:
    def __init__(self, title):
        self.wiki = Wiki('ru')
        self.title = title

    def page(self):
        page = self.wiki.page(self.title)
        if not page.exists():
            page = self
            setattr(page, 'sections', [])
        return page

    def summary(self):
        page = self.page()
        if page.sections != []:
            return {'Общая информация': page.summary}

    def parse_sections(self, sections, summary=None):
        info = {}

        if summary is not None:
            info.update(summary)

        for section in sections:
            if section.text == '':
                value = self.parse_sections(section.sections)
            else:
                value = section.text
            info[section.title] = value
        return info

    def sections(self):
        return self.parse_sections(self.page().sections, self.summary())
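
A minimal usage sketch for the wrapper above. It assumes the missing import is from wikipediaapi import Wikipedia as Wiki (the class refers to an external Wiki name), and the article title is only an illustration:

from wikipediaapi import Wikipedia as Wiki  # assumed import for the class above

article = Wikipedia('Москва')   # wrapper class from the example, illustrative title
info = article.sections()       # dict mapping section titles to text (or nested dicts)
print(list(info))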
Example No. 3
def dump_page(source: str,
              target_folder: Union[Path, str] = "pages",
              wiki_obj: Wikipedia = None,
              lang: str = 'fr'):
    if not wiki_obj:
        wiki_obj = Wikipedia(lang)

    target_folder = Path(target_folder)
    if not target_folder.exists():
        makedirs(target_folder)

    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")

    else:
        page_info = wiki_obj.info(wikipage)
        if page_info.title != wikipage.title:
            wikipage = wiki_obj.page(page_info.title)
        wiki_title = wikipage.title.replace(' ', '_')
        target_file = target_folder / (wiki_title.replace("/", "__SLASH__") +
                                       ".pkl")
        with target_file.open('wb') as fh:
            pkl.dump(wikipage, fh)
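
A hedged usage sketch for dump_page, spelling out the imports the snippet appears to rely on; the page title is illustrative only:

# Imports the function above appears to assume (they would sit at the top of the module)
from os import makedirs
from pathlib import Path
from typing import Union
import pickle as pkl

from wikipediaapi import Wikipedia

# Illustrative call: writes pages/Tour_Eiffel.pkl if the French article exists
dump_page("Tour Eiffel", target_folder="pages", lang="fr")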
Example No. 4
def get_company_details(company):
    """
    Params:
        - company (str)
    Returns:
        - company_description (str)
    """
    wiki_wiki = Wikipedia('en')

    try:
        # try different ways of searching for the company until something good is returned
        page = wiki_wiki.page(company + " (company)")

        if not page.exists():
            page = wiki_wiki.page(company)
    except Exception as err:
        printColoured(err, colour="red")
        raise InvalidUserInput(
            description="Connection timed out. Please try again later")

    company_data = page.text
    company_description = company_data.split("\n")[0]
    return company_description
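
A minimal usage sketch, assuming from wikipediaapi import Wikipedia and that printColoured and InvalidUserInput are application-level helpers defined elsewhere; the company name is illustrative:

from wikipediaapi import Wikipedia

# Prints the first paragraph of the article, per the split("\n")[0] above
print(get_company_details("Atlassian"))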
Example No. 5
    def main(self):
        from aion_core.utils import remove_brackets, remove_space, remove_string_sequence
        from wikipediaapi import Wikipedia

        splitted_acph = self.activate_phrase.split("__and__")
        searched_article = remove_string_sequence(self.speech_input, splitted_acph[0], splitted_acph[-1])
        wiki = Wikipedia(aconf.Aion().get_language().split("_")[0])
        article = wiki.page(searched_article)

        if article.exists():
            article_text = remove_brackets(article.summary)
            article_text = remove_space(article_text)
            self.speech_output(alang.start("skills", "wikipedia", {"article_text": article_text}))
        else:
            self.speech_output(alang.start("skills", "wikipedia_article_not_found", {"article_name": searched_article}))
Example No. 6
def process_page(
    nlp: spacy.Language,
    category: Text,
    wiki: wikipediaapi.Wikipedia,
    page: Text,
) -> bool:
    """Fetches a single page and creates index files."""
    filename = os.path.join("pages", f"{page}.sentences.json")
    output_filename = filename.replace(".sentences.", ".index.")
    if not os.path.exists(filename):
        article = wiki.page(page)
        summary = wikipediaapi.WikipediaPageSection(wiki=wiki,
                                                    title='Summary',
                                                    text=article.summary)
        sections = [summary] + article.sections
        sentences = [
            dict(id=id, **sentence)
            for id, sentence in enumerate(extract_sections(nlp, sections))
        ]
        if any(refer in sentences[0]["line"].lower() for refer in kREFER):
            return False
        with open(filename, 'w') as outfile:
            json.dump(
                {
                    "category": category,
                    "title": page,
                    "sentences": sentences
                },
                outfile,
                indent=2)
    if not os.path.exists(output_filename):
        command = f'node util/single_index.js "{filename}" "{output_filename}"'
        subprocess.call(command, shell=True)

    bucket = fibs_firebase_config.get_bucket()
    blob = bucket.blob(f"pages/{page}.json")
    if not blob.exists():
        blob.upload_from_filename(filename, content_type='application/json')
        bucket.blob(f"indices/{page}.json").upload_from_filename(
            output_filename, content_type='application/json')
        return True
    return False
Example No. 7
def birthday_of_rich(id=1):
    '''
    Parsing Billionaire Birthdays
    '''
    wiki = Wikipedia()
    for id in Rich.select().where(Rich.id >= id):
        # print(id, id.name)
        # Billionaires with no birthday data on their Wikipedia page
        no_bday = [
            'Qin Yinglin', 'Colin Zheng Huang', 'Zhong Huijuan',
            'Walter P.J. Droege', 'Li Xiting', 'Yang Huiyan', 'Joseph Safra',
            'Lukas Walton', 'Theo Albrecht, Jr.', 'Zhang Yiming',
            'Lee Man Tat', 'Wang Wei', 'Radhakishan Damani', 'Liu Yonghao',
            'Wu Yajun', 'Sun Piaoyang', 'Pang Kang', 'Udo Tschira', 'Xu Hang',
            'Pallonji Mistry', 'Zhang Yong', 'Robert Ng', 'Iris Fontbona',
            'Donald Newhouse', 'Graeme Hart', 'Goh Cheng Liang', 'Hank Meijer',
            'Robin Zeng', 'Andreas Struengmann', 'Thomas Struengmann',
            'Hui Wing Mau', 'Quek Leng Chan', 'Sun Hongbin', 'Zhang Bangxin',
            'Lu Zhongfang', 'Cyrus Poonawalla', 'Scott Farquhar',
            'Gong Hongjia', 'Eric Wittouck', 'Xu Shihui', 'Wang Wenyin',
            'Zhang Fan', 'Chen Bang', 'Jiang Rensheng', 'Blair Parry-Okeden',
            'David Duffield', 'Eyal Ofer', 'John Grayken'
        ]
        if id.name in no_bday:
            id.bday = datetime.datetime(1, 1, 1)
            id.save()
            continue
        page_py = wiki.page(id.name)
        link = page_py.fullurl
        response = get_response(link)
        html_doc = BeautifulSoup(response.text, features='html.parser')
        date = html_doc.find('span', {'class': 'bday'})
        if date is None:
            bday = fix_for_data(id.name)
        else:
            bday = datetime.datetime.strptime(date.text, '%Y-%m-%d')
        zodiac = find_zodiac(bday)
        id.bday = bday.date()
        id.zodiac = zodiac
        id.save()
Example No. 8
class Wiki(commands.Cog):
    """Wikipedia page lookup command.
    """
    def __init__(self, bot):
        self.bot = bot
        self.wiki = Wikipedia('en')

    @commands.command(name='wiki', aliases=['page'])
    async def page(self, ctx, *, search):
        result = self.wiki.page(search)
        if not result.exists():
            return await ctx.send(embed=Embed(
                title="Page not found ⛔",
                description=f"No page was found under the name `{search}`",
                color=Color.blurple()))
        wiki = Embed(color=Color.dark_gold())
        wiki.title = result.title
        wiki.url = result.fullurl
        wiki.description = f'{result.text[0:500]}...'
        wiki.set_footer(
            text="Powered by Wikipedia",
            icon_url="https://i.ibb.co/jyX08CD/wikipedia-PNG39.png")
        wiki.timestamp = ctx.message.created_at
        return await ctx.send(embed=wiki)
Example No. 9
class Post:
    def __init__(self, team_names='', team_colors='',
                 team1_players='', team2_players='',
                 options='', header='', question='',
                 twTags=''):
        if team_names == '' or team_colors == '' or team1_players == '' or team2_players == '':
            return

        if options == '' or header == '' or question == '':
            return

        self.listitems = list(options)
        self.header = str(header)
        self.question = str(question)
        self.wiki = Wikipedia('en')
        self.team_names = list(team_names)
        self.team_colors = list(team_colors)
        self.team1_players = list(team1_players)
        self.team2_players = list(team2_players)
        self.twTags = list(twTags)

    def getWikiUrl(self, player_name=''):
        if player_name == '':
            return None

        page_py = self.wiki.page(player_name)
        if not page_py.exists():
            return None

        return page_py.fullurl

    @staticmethod
    def tag(name, *content, style=None, href=None, **attrs):
        if style is not None:
            attrs['style'] = style

        if href is not None:
            attrs['href'] = href

        if attrs:
            attr_str = ''.join(' %s="%s"' % (attr, value)
                               for attr, value
                               in sorted(attrs.items()))
        else:
            attr_str = ""

        if content:
            return '\n'.join('<%s%s>%s</%s>' %
                             (name, attr_str, c, name) for c in content)
        else:
            return '<%s%s />' % (name, attr_str)

    def formatApi(self):
        http_part = "http --auth : --form POST http://www.tactification.com/api_rt/v1.0/new_post "
        question_tag = self.tag('div', self.question, style='color:black')
        br1 = self.tag('br')

        li_items = str()
        for item in self.listitems:
            li_items += self.tag('li', item)

        ul = self.tag('ul', li_items)
        div1 = self.tag('div', ul, style='color:black')

        starring_tag = self.tag('div', "Starring:", style='color:black')

        team1_url = self.getWikiUrl(self.team_names[0])
        if team1_url is None:
            print(self.team_names[0])
            return

        a_team1 = self.tag('a', self.team_names[0], href=team1_url) + ': '
        a_items = str()
        for item in self.team1_players:
            print(item)
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return

            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')

        a_items = a_items.rstrip(',')

        i_team1 = self.tag('i', a_team1+a_items, style="color:" + str(self.team_colors[0]))
        team2_url = self.getWikiUrl(self.team_names[1])
        if team2_url is None:
            print(self.team_names[1])
            return

        a_team2 = self.tag('a', self.team_names[1], href=team2_url) + ': '
        a_items = str()
        for item in self.team2_players:
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return

            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')

        a_items = a_items.rstrip(',')

        i_team2 = self.tag('i', a_team2+a_items, style="color:" + str(self.team_colors[1]))
        header = " header={!r} ".format(self.header)
        twTag = (" twTags='#{}, #{}, #{}' ".format(*self.twTags))
        end_part = "tactical_gif@home_img.jpg tactical_pic_1750@with_help_msg.jpg tactical_pic_1575@with_help_msg_75.jpg tactical_pic_875@with_help_msg_50.jpg"
        final_command = http_part + "body='" + question_tag + br1 + div1 + starring_tag + i_team1 + br1 + i_team2 + "'" + header + twTag + end_part
        print(final_command)
Example No. 10
        "Л": 0,
        "М": 0,
        "Н": 0,
        "О": 0,
        "П": 0,
        "Р": 0,
        "С": 0,
        "Т": 0,
        "У": 0,
        "Ф": 0,
        "Х": 0,
        "Ц": 0,
        "Ч": 0,
        "Ш": 0,
        "Щ": 0,
        "Э": 0,
        "Ю": 0,
        "Я": 0
    }
    for name in names_list:
        alphabet_dict[name[:1].upper()] += 1
    return alphabet_dict


wiki_wiki = Wikipedia('ru')
members = wiki_wiki.page('Категория:Животные по алфавиту').categorymembers
animal_names = get_russian_category_members(members)
counted_names = count_names(animal_names)
for key, value in counted_names.items():
    print(f'{key}: {value}')
Example No. 11
from wikipediaapi import Wikipedia

player_name = input("Enter player name: ")
wiki = Wikipedia(language='en')
page = wiki.page(player_name)

for s in page.sections:
    if 'Club career' in s.title:
        print(s.title)
        for p in s.sections:
            print("\t"+p.title)
            if p.sections:
                for q in p.sections:
                    print("\t\t"+q.title)
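
Example No. 11 walks only two levels of subsections by hand. A short recursive sketch (an assumption about the generalisation, not part of the original) prints the full section tree at any depth:

def print_section_tree(sections, depth=0):
    # Recursively print every section title, indented one tab per nesting level
    for section in sections:
        print("\t" * depth + section.title)
        print_section_tree(section.sections, depth + 1)

print_section_tree(page.sections)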