Example #1
def page(title=None,
         pageid=None,
         auto_suggest=True,
         redirect=True,
         preload=False):
    '''
    Get a WikipediaPage object for the page with title `title` or the pageid
    `pageid` (mutually exclusive).

    Keyword arguments:

    * title - the title of the page to load
    * pageid - the numeric pageid of the page to load
    * auto_suggest - let Wikipedia find a valid page title for the query
    * redirect - allow redirection without raising RedirectError
    * preload - load content, summary, images, references, and links during initialization
    '''

    if title is not None:
        if auto_suggest:
            results, suggestion = search(title, results=1, suggestion=True)
            try:
                title = suggestion or results[0]
            except IndexError:
                # if there is no suggestion or search results, the page doesn't
                # exist
                raise PageError(title)
        return WikipediaPage(title, redirect=redirect, preload=preload)
    elif pageid is not None:
        return WikipediaPage(pageid=pageid, preload=preload)
    else:
        raise ValueError("Either a title or a pageid must be specified")
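
A minimal usage sketch for the function above, assuming the `wikipedia` package is installed and network access is available:

import wikipedia

# Fetch by title; auto_suggest lets Wikipedia correct near-miss queries
ny = wikipedia.page("New York City")
print(ny.title)
print(ny.url)

# Passing neither a title nor a pageid raises ValueError
try:
    wikipedia.page()
except ValueError as err:
    print(err)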
Example #2
def build_wiki_category_dataset():
    readfile = codecs.open(
        '/export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki.txt',
        'r', 'utf-8')
    writefile = codecs.open(
        '/export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki2categories.txt',
        'w', 'utf-8')
    co = 0
    for line in readfile:
        try:
            line_dic = json.loads(line)
        except ValueError:
            continue

        try:
            # title = line_dic.get('title')
            title_id = line_dic.get('id')
            article = WikipediaPage(pageid=title_id)
        except AttributeError:
            continue
        type_list = article.categories
        # print(type_list)
        line_dic['categories'] = type_list
        writefile.write(json.dumps(line_dic) + '\n')
        co += 1
        if co % 5 == 0:
            print(co)
        if co == 100000:
            break
    writefile.close()
    readfile.close()
    print('over')
Example #3
def read_from_wiki(titles):

    """ This function will except a list(list of Strings) of all the titles
    and we will use these strings to get test from wikipedia."""

    out_data_list = []                                                                   # List to append the dictionary elements(i.e. Required data with keys and values.) into one list.

    for index, title in enumerate(titles):
        out_data_dict = {'Title': title, 'Passage': '', 'Question': [], 'Error': None}  # Per-title result: processed text plus any error message.

        try:
            get_wiki_data = WikipediaPage(title = title)                                 # Get all the data from wikipedia.

        except DisambiguationError:
            # The title is ambiguous and matches several pages.
            out_data_dict["Error"] = ("The title '" + title + "' is ambiguous. Please provide a more precise title.")

        except PageError:
            # No page was found for the given title.
            out_data_dict["Error"] = ("Page title '" + title + "' does not match any pages. Try another title!")

        if not out_data_dict["Error"]:
            # If there is no error then store the passage.
            content_only = get_wiki_data.content             # Store main content into a variable.
            processed_text = normalize_passage(content_only) # Process text using normalize_passage().
            out_data_dict['Passage'] = processed_text        # Store received text into dictionary.
            out_data_list.append(out_data_dict)              # Now append each dictionary into List.

    return out_data_list
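
A small usage sketch for the function above; the titles are illustrative, and `normalize_passage` is assumed to be defined elsewhere in the same module:

titles = ["Alan Turing", "Grace Hopper"]
for record in read_from_wiki(titles):
    print(record["Title"], "-", len(record["Passage"].split()), "tokens")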
Example #4
import spacy
from spacy.symbols import PROPN
from wikipedia import WikipediaPage


def main():
    nlp = spacy.load("en_core_web_sm")
    with open('test.txt', 'r') as file:
        data = file.read().replace('\n', '')

    doc = nlp(data)
    wiki = WikipediaPage("Indiana Jones and the Raiders of the Lost Ark")

    counts = dict()  # token/name -> frequency
    names = []       # proper names seen so far (a module-level global in the original; created here so the snippet runs)

    for sent in doc.sents:
        for token in sent:
            if token.is_alpha and not token.is_stop:
                s = getName(token)  # getName() is a helper defined elsewhere in the original module
                if s is None:
                    s = token.lemma_
                if token.pos == PROPN:
                    flag = True
                    for name in names:
                        if s in name:
                            flag = False
                            s = name
                            break
                    if flag:
                        names.append(s)
                if s in counts:
                    counts[s] += 1
                else:
                    counts[s] = 1

    print("---------------------------------------")

    for k in counts:
        if counts[k] > 1:
            print(k, counts[k])

    print("---------------------------------------")
    print(names)

    print("---------------------------------------")

    for sent in doc.sents:
        root = sent.root
        line = ""
        for token in root.lefts:
            s = func(token)  # func() is a helper defined elsewhere in the original module
            print(s)
            if s is not None and s in counts and counts[s] > 1:
                line += " " + s
        line += " " + root.lemma_
        for token in root.rights:
            s = token.lemma_
            if token.pos == PROPN:
                s = str(token)
            if s in counts and counts[s] > 1:
                line += " " + s
        print(line)
Example #5
def process_page(params):
    id_link, link, depth = params
    # Get the wiki object for the link
    try:
        wiki_obj = WikipediaPage(link)
        links = wiki_obj.links
    except (PageError, KeyError, DisambiguationError):
        return 0, (id_link, None, None, link, None)
    title = wiki_obj.title
    summary = wiki_obj.summary[:300].replace("\"", "'")
    return 1, (id_link, summary, depth + 1, title, links)
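
Because `process_page` takes a single `(id_link, link, depth)` tuple, it drops straight into a multiprocessing pool. A minimal sketch; the seed links are illustrative:

from multiprocessing import Pool

seed_params = [(0, "Python (programming language)", 0),
               (1, "Monty Python", 0)]

with Pool(processes=2) as pool:
    for ok, (id_link, summary, depth, title, links) in pool.map(process_page, seed_params):
        if ok:
            print(id_link, title, len(links), "outgoing links")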
Example #6
    def get_articles(self, graph):

        category = WikipediaPage(self.__primarylabel__ + ':' +
                                 self.property_title)

        for page in category.links:
            article = Article(page, depth=self.property_depth + 1)
            graph.push(article)

            self.has_article.add(article)
            graph.push(self)
Example #7
    def get_categories(self, graph):

        category = WikipediaPage(self.__primarylabel__ + ':' +
                                 self.property_title)

        for cat in category.categories:
            category = Category(cat, depth=self.property_depth + 1)
            graph.push(category)

            self.has_category.add(category)
            graph.push(self)
Example #8
def clever_life(response: wikipedia.WikipediaPage, quick_table=None):
    res = {}

    if quick_table is None:
        quick_table = get_quick_table(response.html())
    birthday = -1
    death_day = -1
    age = -1
    alive = True
    precision = True

    birthday_words = {
        "Дата рождения", "Рождение", "Родился", "Родилась", "Рождён"
    }
    death_words = {"Дата смерти", "Смерть", "Умер", "Умерла", "Убит"}

    if "Дата рождения" in quick_table:
        raw_birthday = find_4_digit_nums(quick_table["Дата рождения"])
        if raw_birthday:
            birthday = raw_birthday[0]
        if raw_birthday and "Дата смерти" in quick_table:
            death_day = find_4_digit_nums(quick_table["Дата смерти"])[0]
            alive = False
            age = death_day - birthday
        elif raw_birthday:
            alive = True
            age = 2020 - birthday

    if birthday == -1:
        precision = False
        summary = response.summary
        bad_data = get_life_time(summary)
        if "born" in bad_data:
            birthday = bad_data["born"]
            if "end" in bad_data:
                death_day = bad_data["end"]
                alive = False
            else:
                death_day = 2020
                alive = True
        else:
            birthday = random.randint(1900, 2020)
            death_day = random.randint(birthday, 2500)
            alive = death_day > 2020
        age = death_day - birthday
    res = {
        "alive": alive,
        "birth_day": birthday,
        "death_day": death_day,
        "age": age,
        "precision": precision
    }
    return res
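
The helpers `get_quick_table`, `get_life_time`, and `find_4_digit_nums` are not shown here. A minimal sketch of what `find_4_digit_nums` presumably does (pull four-digit years out of an infobox value), assuming a plain regex is sufficient:

import re

def find_4_digit_nums(text):
    """Return every four-digit number in `text` as an int (assumed helper)."""
    return [int(match) for match in re.findall(r"\b\d{4}\b", text)]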
Example #9
    def handle_noargs(self, **options):
        DOCTORS_LIST = [
            {'Number': 1, 'First': 'William', 'Second': 'Hartnell',
             'Start_Date': '1963-11-23', 'End_Date': '1966-10-29', 'Slug': 'First_Doctor'},
            {'Number': 2, 'First': 'Patrick', 'Second': 'Troughton',
             'Start_Date': '1966-10-29', 'End_Date': '1969-07-21', 'Slug': 'Second_Doctor'},
            {'Number': 3, 'First': 'Jon', 'Second': 'Pertwee',
             'Start_Date': '1970-01-03', 'End_Date': '1974-07-08', 'Slug': 'Third_Doctor'},
            {'Number': 4, 'First': 'Tom', 'Second': 'Baker',
             'Start_Date': '1981-03-08', 'End_Date': '1981-03-21', 'Slug': 'Fourth_Doctor'},
            {'Number': 5, 'First': 'Peter', 'Second': 'Davison',
             'Start_Date': '1981-03-21', 'End_Date': '1984-03-16', 'Slug': 'Fifth_Doctor'},
            {'Number': 6, 'First': 'Colin', 'Second': 'Baker',
             'Start_Date': '1984-03-16', 'End_Date': '1986-12-06', 'Slug': 'Sixth_Doctor'},
            {'Number': 7, 'First': 'Sylvester', 'Second': 'McCoy',
             'Start_Date': '1987-09-07', 'End_Date': '1996-05-27', 'Slug': 'Seventh_Doctor'},
            {'Number': 8, 'First': 'Paul', 'Second': 'McGann',
             'Start_Date': '1996-05-27', 'End_Date': '1996-05-27', 'Slug': 'Eighth_Doctor'},
            {'Number': 9, 'First': 'Christopher', 'Second': 'Eccleston',
             'Start_Date': '2005-03-26', 'End_Date': '2005-07-18', 'Slug': 'Ninth_Doctor'},
            {'Number': 10, 'First': 'David', 'Second': 'Tennant',
             'Start_Date': '2005-07-18', 'End_Date': '2010-01-01', 'Slug': 'Tenth_Doctor'},
            {'Number': 11, 'First': 'Matt', 'Second': 'Smith',
             'Start_Date': '2010-01-01', 'End_Date': '2013-12-25', 'Slug': 'Eleventh_Doctor'},
            {'Number': 12, 'First': 'Peter', 'Second': 'Capaldi',
             'Start_Date': '2013-12-25', 'End_Date': datetime.datetime.now(), 'Slug': 'Twelfth_Doctor'},
        ]

        for item in DOCTORS_LIST:
            slug = item["Slug"]
            page = WikipediaPage(slug)

            doctor = Doctor.objects.create(
                number=item["Number"],
                first_name=item["First"],
                second_name=item["Second"],
                character_bio=page.section("Character biography"),
                personality=page.section("Personality"),
                appearance=page.section("Appearance"),
                story_style=page.section("Story style"),
                later_appearances=page.section("Later appearances"),
                other_mentions=page.section("Other mentions"),
                reception=page.section("Reception"),
                short_desc="",
                about=page.summary,
                # doc_img=img,
                start_date=item["Start_Date"],
                end_date=item["End_Date"],
                slug=slug)
            doctor.save()
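
Note that `WikipediaPage.section()` returns None when a heading is absent, so the fields above may receive None for headings that a particular Doctor's article lacks. A quick check, with the page title chosen only for illustration:

page = WikipediaPage("Twelfth_Doctor")
print(page.section("Personality") is None)  # True when that heading does not exist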
Example #10
    def get_categories(self):

        article = WikipediaPage(self.title)

        for category in article.categories:
            title = prefix + category
            category = WikiNode(title, self.graph, label="Category")
            category.depth = self.depth + 1

            self.graph.create(category)
            self.graph.push(category)

            self.has_category.add(category)
            self.graph.push(self)
Example #11
    def get_plot(self):
        """gets the plot"""
        # This is in case 'Plot' is not found
        plot_names = ['Plot', 'Summary', 'Premise', 'Synopsis']
        plot_section = None
        self.full_plot_section = None
        for i in plot_names:
            try:
                plot_section = WikipediaPage(self.title).section(i)

                # saved for later use in get_keywords
                self.full_plot_section = plot_section

                # keep only a brief summary, not the whole plot; if the section
                # is missing, slicing None raises TypeError and we try the next name
                plot_section = plot_section[:1000] + "..."
                plot_section = plot_section.replace('\n', ' ')  # remove newlines
                return plot_section
            except TypeError:
                continue
        if plot_section is None:
            plot_section = 'none'
            return plot_section
Example #12
def process_co(message):
    try:
        co_msg = str(message.text)
        set_lang("en")
        lat, lon = WikipediaPage(co_msg).coordinates
        bot.send_message(
            chat_id=message.chat.id, text=str(round(lat, 5)) + ", " + str(round(lon, 5))
        )
        bot.send_location(
            chat_id=message.chat.id,
            latitude=lat,
            longitude=lon,
            reply_markup=main_keyboard(),
        )
    except Exception:
        bot.send_message(
            chat_id=message.chat.id,
            text="Not a location.",
            reply_markup=main_keyboard(),
        )
Example #13
    def get_html_for_page(self, page: wikipedia.WikipediaPage):
        """Construct HTML for page

        TODO: Fix HTML links so that images and external links work correctly.
        For example, current wiki links are `href="/wiki/Mount_Lago"`, and that
        should be changed to `href="wikipedia.org/wiki/Mount_Lago"`. Note that
        it would be cool if you were able to check if the page pointed to is
        also in the Wikipedia extract, because then you could link to it
        offline!?!?
        """
        if not isinstance(page, wikipedia.WikipediaPage):
            raise TypeError('page must be of type wikipedia.WikipediaPage')

        html = page.html()

        # Download images
        image_paths = self._download_images(page.images)

        # Fix links
        html = self._fix_links(html, image_paths)

        return html
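
The TODO above can be handled with a small rewrite pass over the returned HTML. A minimal sketch using BeautifulSoup; the function name and the absolute https://en.wikipedia.org prefix are assumptions, not part of the class above:

from bs4 import BeautifulSoup

def rewrite_wiki_links(html: str) -> str:
    """Turn relative /wiki/... hrefs into absolute wikipedia.org URLs (illustrative helper)."""
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        if anchor['href'].startswith('/wiki/'):
            anchor['href'] = 'https://en.wikipedia.org' + anchor['href']
    return str(soup)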
Example #14
    def get_rt_score(self):
        """gets Rotten Tomatoes score"""
        full_page = WikipediaPage(self.title).content
        sentences = nltk.sent_tokenize(full_page)
        rt_score = [
            x for x in sentences if "Rotten Tomatoes" in x and "%" in x
        ]

        # for films without a Rotten Tomatoes score (ex. La Jetee)
        if rt_score:
            rt_score = rt_score[0]
        else:
            rt_score = 'none'

        # Clean it up: keep only the text after the last newline character
        rt_score = rt_score.strip()  # remove surrounding whitespace
        rt_score = rt_score[rt_score.rfind('\n') + 1:]  # rfind returns -1 when absent, so the whole string is kept
        return rt_score
Example #15
def findSummaries(s):
    global wikiFlag
    global wikiChars
    global nameSummaries
    global soupChars
    sum = ""
    try:
        if wikiFlag:
            s1 = uniqueMatchAgainstLastName(s)
            if s != s1:
                nicknames[s] = s1
            s = s1
        wiki = WikipediaPage(s)
        if "characters" in wiki.title:
            # name brought up a list of characters wiki page
            if not wikiFlag:
                # this is the first time we've found this page
                wikiFlag = True
                wikiChars = wiki
                soupChars = BeautifulSoup(wikiChars.html(),
                                          'html.parser').find_all(
                                              'span', class_='mw-headline')
                catchUpOnSummaries()
            summary = getSomeText(wiki.section(uniqueMatchAgainstLastName(s)))
            if summary is not None:
                # the name is a section, so assign that summary
                nameSummaries[s] = summary
                return
            s2 = uniqueMatchNameAgainstFirstName(s)
            if s2 != s:
                # the name (uniquely) matches to a first name, so grab that summary
                summary = getSomeText(wiki.section(s2))
                if summary is not None:
                    nameSummaries[s] = summary
                    return
            s3 = uniqueMatchNameAgainstLastName(s.split(' ')[-1])
            if s3 != s:
                nameSummaries[s] = getSomeText(wiki.section(s3))
            else:
                nameSummaries[s] = None
            return
        else:
            nameSummaries[s] = getSomeText(wiki.summary)
            findNicknames(s, wiki)
            return
    except wikipedia.exceptions.DisambiguationError as e:
        if not wikiFlag:
            namesToTry.append(s)
            return
        if verbose: print("-e1", s)  #
        # name matched by last name is causing wiki ambiguation
        # try matching by first name
        s2 = uniqueMatchAgainstFirstName(s)
        if s2 != s:
            if verbose: print("-e2", s2)
            nicknames[s] = s2
            findSummaries(s2)
            return
        # first name did not uniquely match, so try just s's last name
        # and matching against another last name
        s3 = uniqueMatchAgainstLastName(s.split(' ')[-1])
        if s3 != s:
            if verbose: print("-e3", s3)  #
            nicknames[s] = s3
            findSummaries(s3)
            return
        # nothing is matching, so just take the first entry on wikipedia
        if verbose: print("-ed", e.options[0])  #
        nicknames[s] = e.options[0]
        findSummaries(e.options[0])
        return
    except Exception:
        if not wikiFlag:
            namesToTry.append(s)
            return
        if verbose: print("-m1", s)  #
        summary = getSomeText(wikiChars.section(s))
        if summary is not None:
            nameSummaries[s] = summary
            return
        s2 = uniqueMatchAgainstFirstName(s)
        if s2 != s:
            nicknames[s] = s2
            if verbose: print("-m2", s2)  #
            summary = getSomeText(wikiChars.section(s2))
            if summary is not None:
                nameSummaries[s] = summary
                return
        s3 = uniqueMatchAgainstLastName(s.split(' ')[-1])
        if s3 != s:
            nicknames[s] = s3
            if verbose: print("-m3", s3)  #
            summary = getSomeText(wikiChars.section(s3))
            if summary is not None:
                nameSummaries[s] = summary
                return
    # Everything failed; the description for this name must be too complex to find
    if verbose: print("None", s)
    nameSummaries[s] = None
Example #16
import wikipedia
from wikipedia import WikipediaPage

wiki = WikipediaPage("Indiana Jones and the Raiders of the Lost Ark")

print("Ark" in wiki.links)
Example #17
def send_summary():  # for Nick's microservice
    """exports the full wikipedia page"""

    user_input = request.args.get("article")
    full_page_2 = WikipediaPage(user_input).summary
    return str(full_page_2)
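
The Flask route decorator is not shown above; a minimal sketch of how the endpoint might be wired up, where the route path and app name are assumptions:

from flask import Flask, request
from wikipedia import WikipediaPage

app = Flask(__name__)

@app.route("/summary")
def send_summary():  # for Nick's microservice
    """returns the summary of the requested Wikipedia article"""
    user_input = request.args.get("article")
    return str(WikipediaPage(user_input).summary)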
Example #18
         ("Applied Mathematics", "Statistics"),
         ("Applied Mathematics", "Game theory"),
         ("Applied Mathematics", "Information theory"),
         ("Applied Mathematics", "Computer science"),
         ("Applied Mathematics", "Theory of computation"),
         ("Applied Mathematics", "Control theory"), ("Others", "Order theory"),
         ("Others", "Graph theory")]

# Step 1. Get pages (and raw equations) from wikipedia

for pair in pages:
    domain = pair[0]
    method = pair[1]
    if method not in results:

        result = WikipediaPage(method)

        # Show a visual check!
        print("Matching %s to %s" % (method, result.title))
        entry = {
            'categories': result.categories,
            'title': result.title,
            'method': method,
            'url': result.url,
            'summary': result.summary,
            'images': result.images
        }

        # We can use links to calculate relatedness
        entry['links'] = get_attribute(result, 'links')
        entry['references'] = get_attribute(result, 'references')