def extract_Wikipage_from_title(wiki_title, verbose=True):
    try:
        wiki_text = wikipedia.WikipediaPage(wiki_title).content
        return wiki_text
    except Exception:
        if verbose:
            print(wiki_title, 'not found.')
        return []
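A minimal usage sketch for the helper above, assuming the wikipedia package (`pip install wikipedia`) is imported at module level; the page title is only an example:

import wikipedia

text = extract_Wikipage_from_title('Python (programming language)')
if text:                 # an empty list is returned when the page is not found
    print(text[:200])    # first 200 characters of the article body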
Example #2
 def __get_wiki_page(self):
     try:
         return wiki.page(self.search, auto_suggest=True)
     except PageError:
         print(
             'No Page Found. Try adding a year to your query or check if information is correct.'
         )
         return wiki.WikipediaPage('Main_Page')
Example #3
def find_latin_name(nom_fr):
    """
    Given the French common name of a species, return a dictionary keyed by that
    common name whose value is a list containing:
    - the French common name
    - the Wikipedia URL for that common name
    - its taxonomic rank
    - the Latin name
    """

    global u
    dico_fr = dict()
    wikipedia.set_lang("fr")
    key = nom_fr
    search = wikipedia.search(nom_fr)
    if search == list():
        taxon = "taxon non trouvé"
        nom_latin = "nom latin non trouvé"
        url = "https://fr.wikipedia.org/"
        dico_fr[key] = [key, url, taxon, nom_latin]
        return dico_fr

    search = search[0]
    wiki = wikipedia.WikipediaPage(search)
    url = wiki.url

    requete = requests.get(url)
    page = requete.content
    soup = BeautifulSoup(page, features="lxml")
    if soup.find("div", {"class": "center taxobox_classification"}) is None:
        nom_latin = "nom latin non trouvé"
    else:
        nom_latin = soup.find("div", {
            "class": "center taxobox_classification"
        }).text
    # Cut the taxobox text at the next capital letter after the first character
    # (the start of the following entry), falling back to at most 20 characters.
    u = 0
    for count, i in enumerate(nom_latin):
        if count == 0:
            continue
        elif i in (" ", "'", "."):
            continue
        elif i.upper() == i:
            u = count
            break
        else:
            u = 20
    nom_latin = nom_latin[:u]
    if soup.find("p", {"class": "bloc"}) is None:
        taxon = "taxon non trouvé"
    else:
        taxon = soup.find("p", {"class": "bloc"}).text

    # build the output dictionary
    dico_fr[key] = [key, url, taxon, nom_latin]
    return dico_fr
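An illustrative call of find_latin_name; the exact URL, rank and Latin name depend on the live French Wikipedia page for the chosen common name:

resultat = find_latin_name("Chat")
for nom, (nom_fr, url, taxon, nom_latin) in resultat.items():
    # each value is [common name, Wikipedia URL, taxonomic rank, Latin name]
    print(nom_fr, url, taxon, nom_latin)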
Example #4
def printContent(page_title):
    # page_title = "Information_retrieval"
    url_wiki = urllib2.unquote(page_title).decode('utf8')
    wiki_main = "https://en.wikipedia.org"
    folder = "/wiki/" + page_title
    data_read = urllib2.urlopen(wiki_main + folder).read()
    soup = BeautifulSoup(data_read, 'html.parser')
    wiki_content_py = wikipedia.WikipediaPage(url_wiki).content.encode("utf-8")
    div_id_content = soup.find("div", {"class": "mw-parser-output"})

    try:
        div_id_content.find("div", {"id": "toc"}).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find("table", {
            "class": "vertical-navbox nowraplinks plainlist"
        }).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find(
            "table", {
                "class":
                "plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox"
            }).decompose()
    except AttributeError:
        pass
    try:
        div_id_content.find("table", {"role": "presentation"}).decompose()
    except AttributeError:
        pass

    if div_id_content is not None:
        a_tags = div_id_content.findAll('a', href=True)
        wiki_links_page_title = list()
        for a_tag in a_tags:
            if a_tag['href'][0:6] == "/wiki/" and len(
                    a_tag.text.encode("utf-8")) > 2:
                wiki_links_page_title.append(a_tag)

        start_index = 0
        for link in wiki_links_page_title:
            text = link.text.encode("utf-8")
            link = link['href'][6:].encode("utf-8")
            found_at = wiki_content_py.find(text, start_index)
            if found_at != -1:
                wiki_content_py = wiki_content_py[:found_at] + "[[" + link + "||" + wiki_content_py[
                    found_at:found_at +
                    len(text)] + "]]" + wiki_content_py[found_at + len(text):]
                start_index = found_at + 4 + len(link)

    lines = wiki_content_py.splitlines()

    for line in lines:
        if count_aphabets(line) > 3 and not (line.find("{") != -1
                                             and line.find("}") != -1):
            print line
Example #5
def make_card(user_input):
    try:
        pg = wikipedia.WikipediaPage(title=user_input)
    except:
        p = wikipedia.search(query=user_input, suggestion=True)
        user_input = p[0][0]
        pg = wikipedia.WikipediaPage(title=user_input)

    try:
        pghtml = pg.html()
        soup = BeautifulSoup(pghtml, 'html.parser')
        table = soup.table
        # row = table.findAll('tr')
        heading = table.findAll('th')
        data = table.findAll('td')
        # heads = table.findAll('tr')
    except:
        return " "
    info_box = []

    for h, d in zip(heading[:7], data[:7]):
        info_box.append(h.get_text() + ':  ' + d.get_text())

    # for head in heads[:7]:
    # info_box.append(head.get_text())

    def removeNestedParentheses(s):
        # strips bracketed fragments such as citation markers "[1]", including nested ones
        ret = ''
        skip = 0
        for i in s:
            if i == '[':
                skip += 1
            elif i == ']' and skip > 0:
                skip -= 1
            elif skip == 0:
                ret += i
        return ret

    print(len(info_box))
    if len(info_box) > 2:
        for x in range(len(info_box)):
            info_box[x] = removeNestedParentheses(info_box[x])
        return info_box[1:]
    else:
        return " "
Example #6
def get_image(user_input):
    pg = wikipedia.WikipediaPage(title=user_input)
    html_page = pg.html()
    bs = BeautifulSoup(html_page, 'html.parser')
    use_less = "//upload.wikimedia.org/wikipedia/commons/thumb/9/98/Ambox_current_red.svg/42px-Ambox_current_red.svg.png"
    image = bs.findAll('img', height=True)[0]
    if (image.get('src') == use_less):
        image = bs.findAll('img')[1]
    return "https://" + image.get('src')[2:]
Example #7
def get_paragraphs(question: str):
    search_results = wikipedia.search(question)
    if not search_results:  # wikipedia.search returns an empty list when nothing is found
        print("couldn't find results!")
        return None
    target = search_results[0]
    page = wikipedia.WikipediaPage(title=target)
    contents = page.content
    return get_five_paragraphs(contents)
Example #8
def scraping(title,movies):    
    intro=[]
    plot=[]
    plot_s=[]
    story=[]
    import wikipedia
    for i in range(len(movies)):
        try:
            intro.append(wikipedia.WikipediaPage(title = title[i]).summary)
            plot.append(wikipedia.WikipediaPage(title[i]).section('Plot'))
            plot_s.append(wikipedia.WikipediaPage(title[i]).section('Plot summary'))
            print("Current progress",np.round(i/len(title)*100,2),"%")
            print(datetime.datetime.now().time())
        except wikipedia.DisambiguationError: 
            # if a disambiguation page is found, append 'ambiguous' to intro, plot and plot_s
            intro.append('ambiguous')
            plot.append('ambiguous')
            plot_s.append('ambiguous')
Example #9
def getDOB(title):
    DOB = ''
    html = BeautifulSoup(wikipedia.WikipediaPage(title).html(), 'html.parser')
    try:
        DOB = html.find('span', class_="bday").string
    except AttributeError:
        DOB = html.find('span', class_="dtstart bday").string

    return DOB
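An illustrative call; the bday span is only present on biography articles that carry the hCard microformat, so other titles may still raise here:

print(getDOB("Alan Turing"))  # e.g. '1912-06-23'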
Example #10
def grab_content(page_id, clean=True):
    try:
        page_content = wikipedia.WikipediaPage(pageid=page_id).content
    except:
        page_content = ''
    if clean:
        return cleaner(page_content)
    else:
        return page_content
Example #11
def get_title(title):
    try:
        results, suggestion = wikipedia.search(title,
                                               results=1,
                                               suggestion=True)
        title = suggestion or results[0]
        return wikipedia.WikipediaPage(title)
    except:
        return None
Example #12
async def wiki(query):
    'wikipedia'
    if query.startswith('-'):
        lang, search = query.split(None, 1)
        lang = lang[1:]
        try:
            wikipedia.set_lang(lang)
            out_message = wikipedia.WikipediaPage(wikipedia.search(search)[0]).content
        except wikipedia.DisambiguationError as e:
            out_message = str(e)
        return out_message

    try:
        #out_message = wikipedia.summary(t.group(2), sentences=30)
        #out_message = wikipedia.WikipediaPage(query).content
        out_message = wikipedia.WikipediaPage(wikipedia.search(query)[0]).content
    except wikipedia.DisambiguationError as e:
        out_message = str(e)
    return out_message
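A minimal sketch of driving the coroutine above with asyncio; the leading '-fr' token selects French Wikipedia for the lookup, and the query is only an example:

import asyncio

async def demo():
    article = await wiki('-fr Tour Eiffel')  # '-<lang> <query>' picks the language
    print(article[:300])

asyncio.run(demo())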
def get_tfidf_doc(docTitle):
    sortedTitles = tfidf.tfidf(docTitle, nTitles=20)

    documents = list()
    for title in sortedTitles:
        wikisearch = wikipedia.WikipediaPage(title[0])
        wikicontent = wikisearch.links  # collects the page's outgoing links, not its text
        documents.append(wikicontent)
    return sortedTitles, documents
Example #14
def wiki(celestial_object):
    ans = celestial_object
    cwd = os.getcwd()
    with open(os.path.join(cwd, 'display_info.yml'), 'r') as stream:
        all_display_statistics = load(stream, Loader=SafeLoader)

    req_statistics = all_display_statistics.get(ans, {})

    if ans in ["spiral", "elliptical"]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is {} Galaxy : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        # print(wikipedia.summary("Spiral Galaxy", sentences=2))
        print(wikipedia.WikipediaPage(title='{} galaxy'.format(ans)).summary)
    elif ans in [
            'mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus',
            'neptune'
    ]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is {} Planet : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        statistics = "\n".join([
            '-- {}: {}'.format(parameter, value)
            for parameter, value in req_statistics.items()
        ])
        print("{}\n\n".format(statistics))
        # print(wikipedia.summary("Mercury (planet)", sentences=2))
        print(wikipedia.WikipediaPage(title='{} (planet)'.format(ans)).summary)
    elif ans in [
            'moon', 'stars', 'nebula', 'supernova', 'cluster_of_galaxies'
    ]:
        print("--------------------------------------------------------")
        print("Classified Celestial Object is the {} : ".format(
            ans.capitalize()))
        print("-------------------------------------------------------- \n")
        statistics = "\n".join([
            '-- {}: {}'.format(parameter, value)
            for parameter, value in req_statistics.items()
        ])
        print("{}\n\n".format(statistics))
        print(wikipedia.WikipediaPage(title='{}'.format(ans)).summary)
    return " "
def main():
    wiki_article_sentences = {}
    selected_key_concept = "Natural language processing"
    wiki_article_sentences[selected_key_concept] = [
        'Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.',
        ' Natural language processing is used for various purposes.'
    ]
    sentences = wiki_article_sentences[selected_key_concept]

    dir = os.path.join(folderpath, "Definition/Model")
    p = re.compile(r'\([^()]*\)')  # matches parenthesised fragments to strip
    test_obj = testing(dir)
    test_obj.load_model()

    check_def_in_text = False
    definitions = []
    for sent in sentences:
        orig_sent = sent
        if len(sent) > 10:  # skip empty sentences and those with only a few words
            processed_1 = p.sub('', sent)
            processed_2 = remove_non_ascii(processed_1)
            processed_3 = processed_2.replace('[', '').replace(']', '').replace(';', ',')
            result = test_obj.predic_classes([processed_3])

            if result[0][0] == 1:
                definitions.append(orig_sent)
                check_def_in_text = True
                print("Definition found")
    check_def_in_wiki = False
    if not check_def_in_text:
        definitions = []
        obj = wikipedia.WikipediaPage(selected_key_concept)
        summarised_text = obj.summary
        wikipedia_sentences = nltk.sent_tokenize(summarised_text)
        for sent in wikipedia_sentences:
            orig_sent = sent
            if len(sent) > 10:  # skip empty sentences and those with only a few words
                processed_1 = p.sub('', sent)
                processed_2 = remove_non_ascii(processed_1)
                processed_3 = processed_2.replace('[', '').replace(']', '').replace(';', ',')
                result = test_obj.predic_classes([processed_3])

                if result[0][0] == 1:
                    definition_found = orig_sent
                    check_def_in_wiki = True
                    definitions.append(orig_sent)
    print(definitions)
Example #16
    def get_summary(self, queries, prev_summary=None, prev_failed=None):

        if prev_summary and prev_failed:
            log.info('Reusing previous summaries.')

            pd_prev_summaries = pd.read_csv(prev_summary,
                                            sep='\t',
                                            keep_default_na=False)
            pd_prev_failed = pd.read_csv(prev_failed,
                                         sep='\t',
                                         keep_default_na=False)

            known_successful_queries = pd_prev_summaries['query'].tolist()
            known_failed_queries = pd_prev_failed['query'].tolist()
            known_queries = known_successful_queries + known_failed_queries

            deprecated_queries = [q for q in known_queries if q not in queries]
            queries = [q for q in queries if q not in known_queries]

            log.info('Found %d deprecated queries', len(deprecated_queries))
            log.info('Found %d new queries', len(queries))

        summaries = []
        failed_queries = []

        num_queries = len(queries)
        log_every = [i * int(num_queries / NUM_LOGS) for i in range(NUM_LOGS)]

        for idx, query in enumerate(queries):
            if idx in log_every:
                log.info('Processing query {}/{}'.format(idx, num_queries))

            try:
                summary = wikipedia.WikipediaPage(query).content.replace(
                    '\n', ' ')
                summaries.append([query, summary])
            except:
                failed_queries.append([query])

        pd_summaries = pd.DataFrame(summaries, columns=['query', 'summary'])
        pd_failed = pd.DataFrame(failed_queries, columns=['query'])

        if prev_summary and prev_failed:
            pd_summaries = pd_summaries.append(pd_prev_summaries)
            pd_failed = pd_failed.append(pd_prev_failed)

            pd_summaries = pd_summaries[~pd_summaries['query'].isin(deprecated_queries)]
            pd_failed = pd_failed[~pd_failed['query'].isin(deprecated_queries)]

        log.info('Successfully got wikipedia summaries for %d queries',
                 pd_summaries.shape[0])
        log.info('Failed to get wikipedia summaries for %d queries',
                 pd_failed.shape[0])

        return pd_summaries, pd_failed
Example #17
def wiki_summary(text):
    text = text + ' wikipedia'
    search_results = google_search(text)

    wiki_list = search_results[0].split('/')
    search_query = wiki_list[-1].replace('_', ' ')

    summary = wikipedia.WikipediaPage(title=search_query).summary
    print(summary)
    return summary
Example #18
def main():
    tupledict, tupletotal = get_tuples(sys.argv[1], int(sys.argv[2]))
    page = sys.argv[3]
    text = wikipedia.WikipediaPage(page).content.encode('ascii', 'xmlcharrefreplace')
    score = 0.0
    for tupletext in tupledict:
        if tupletext in text:
            score = score + tupledict[tupletext]
    score = (score / tupletotal)
    print "score=%f page=%s" % (score, page)
Example #19
def suggest(request):
    title = request.GET['q']
    result = wiki.WikipediaPage(title=title).html()
    return render(
        request,
        template_name="wikipage.html",
        context={"article": {
            "title": title,
            "html_content": result
        }})
    def get_links(self, wiki_page):

        if wiki_page.title() in self.tested_pages:
            raise IOError('Link already found')

        self.tested_pages.add(wiki_page.title())

        wiki_links = wikipedia.WikipediaPage(wiki_page.title()).links

        return wiki_links
def scrape_data(topics):
    for topic in topics:
        topic_to_noun[topic] = {}
        try:
            content = wikipedia.WikipediaPage(topic).content
        except wikipedia.exceptions.DisambiguationError as e:
            print("Error: {0}".format(e))
            continue
        extract_nouns(content, topic)
        try:
            external_links = wikipedia.WikipediaPage(topic).links
        except wikipedia.exceptions.DisambiguationError as e:
            print("Error: {0}".format(e))
            continue
        for links in external_links:  # follow the links and extract content
            try:
                link_content = wikipedia.WikipediaPage(links).content
            except wikipedia.exceptions.DisambiguationError as e:
                print("Error: {0}".format(e))
                continue
            extract_nouns(link_content, topic)
        print("--- Finished scraping topic! " + str(topic) + " ----")
Example #22
def get_historic_events():
    date = datetime.now()
    date = date.strftime('%B') + ' ' + str(date.day)
    page = wikipedia.WikipediaPage(date)
    soup = BeautifulSoup(page.html(), features="html.parser")
    events = soup.find("span", id="Events").find_next("ul").find_all("li")
    events = [i.text for i in events]
    events = sample([i for i in events if len(i) < 108],
                    3)  # limit event text length because of screen size
    return jsonify({'events': events}), 200
def generate_dictionary(tag, max_word_length):

    wiki.set_lang(tag)
    lst = []
    for topic in config_2.language_tags[tag]:
        page = wiki.WikipediaPage(topic)
        content = page.content
        content = unidecode(content)
        lst.extend(process(content, max_word_length))  # accumulate words from every topic

    return lst
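A hypothetical call of generate_dictionary; it assumes config_2.language_tags maps a language tag such as 'en' to a list of page titles, and that process() returns a list of words no longer than max_word_length:

words = generate_dictionary("en", max_word_length=12)
print(len(words), "words collected")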
Example #24
 def makeSuits(self):
     urls = wikipedia.WikipediaPage('Minor Arcana').images
     return [self.makeName(list(zip([url for url in urls if url.rfind(
         'Wands') > 0], names)), 'Wands'),
         self.makeName(list(zip([url for url in urls if url.rfind(
             'Pents') > 0], names)), 'Coins'),
         self.makeName(list(zip([url for url in urls if url.rfind(
             'Cups') > 0], names)), 'Cups'),
         self.makeName(list(zip([url for url in urls if url.rfind(
             'Swords') > 0], names)), 'Swords')]
Example #25
 def viki_wikifull_page(self, query=None):
     try:
         data = wikipedia.WikipediaPage(title=query,
                                        pageid=None,
                                        redirect=True,
                                        preload=False,
                                        original_title=u'')
         print data.images
     except Exception as e:
         return json.dumps({'error': str(e)})
Example #26
 def auto_name(name):
     try:
         for i in wikipedia.search(name, results=3):
             try:
                 wikipedia.WikipediaPage(i)
                 return i
             except:
                 pass
     except:
         return name
     return name  # fall back to the original name if no search result validates
def get_paragraphs(question: str, answers: list):
    search_results = wikipedia.search(question, results=10)
    if not search_results:  # wikipedia.search returns an empty list when nothing is found
        print("couldn't find results!")
        return None
    page_list = [
        wikipedia.WikipediaPage(title=target) for target in search_results
    ]
    content_list = [page.content for page in page_list]
    return get_five_paragraphs(content_list, answers)
Example #28
def getArticle(articleName):
    try:
        articlePage = wikipedia.WikipediaPage(articleName)

    except wikipedia.exceptions.DisambiguationError as e:
        print('\nDisambiguation Error: Article name is too broad')
        return None

    except wikipedia.exceptions.PageError as e:
        print('Article page not found, try a different name')
        return None

    return articlePage
    def get_wiki_image_parser(self, search_term, ordinal_num):

        wikipedia.set_lang("en")  # set the language before searching so it applies to the query
        result = wikipedia.search(search_term, results=1)
        wkpage = wikipedia.WikipediaPage(title=result[0])

        for image in wkpage.images:
            if ordinal_num in image:
                return image

        return ""
Example #30
def lookup_state_wiki(state_name):
    '''
    Finds wikipedia page for given state_name
    Converts html to text
    '''
    page_html = wikipedia.WikipediaPage(state_name).html()
    if page_html:
        soup = BeautifulSoup(page_html, "html.parser")
        return soup.get_text()
    else:
        print("State not found")