Example #1
    def test_cache_not_populated_when_disabled(self):
        wiki = WikiApi({'cache': False})

        assert self._get_cache_size(wiki) == 0
        wiki.find('Bob Marley')
        assert self._get_cache_size(wiki) == 0
        shutil.rmtree(wiki.cache_dir, ignore_errors=True)
Example #2
    def test_cache_populated(self):
        wiki = WikiApi({'cache': True, 'cache_dir': '/tmp/wikiapi-test'})

        assert self._get_cache_size(wiki) == 0
        # Make multiple calls to ensure no duplicate cache items created
        assert wiki.find('Bob Marley') == wiki.find('Bob Marley')
        assert self._get_cache_size(wiki) == 1

        # Check cache keys are unique
        assert wiki.find('Tom Hanks') != wiki.find('Bob Marley')

        assert self._get_cache_size(wiki) == 2
        shutil.rmtree(wiki.cache_dir, ignore_errors=True)
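
Both cache tests above call a _get_cache_size helper that the snippets do not show. Below is a minimal sketch of such a helper, assuming it lives on the same test class and that the cache is stored as one file per entry under wiki.cache_dir; these details are assumptions, not the library's actual code.

import os

def _get_cache_size(self, wiki_instance):
    """Count cached entries, assumed to be one file per entry under cache_dir."""
    if not os.path.exists(wiki_instance.cache_dir):
        return 0
    return len(os.listdir(wiki_instance.cache_dir))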
Example #3
 def set_up(self):
     # using an Italian-Emilian locale that is full of unicode symbols
     wiki = WikiApi({'locale': 'eml'})
     result = wiki.find('Bulaggna')[0]
     return {
         'wiki': wiki,
         'result': result,
     }
Example #4
 def set_up(self):
     wiki = WikiApi()
     results = wiki.find('Bill Clinton')
     article = wiki.get_article(results[0])
     return {
         'wiki': wiki,
         'results': results,
         'article': article,
     }
Example #5
def get_url(query, log_file):
  wiki = WikiApi()
  results = wiki.find(query)
  if len(results) == 0:
    sys.stderr.write("No wikipedia article found for '" + query + "'\n")
  else:
    article = wiki.get_article(results[0])
    print(article.url)
    with open(log_file, 'a') as f:
      f.write(article.url + "\n")
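
One possible way to drive the get_url helper above from the command line; the script name and arguments are only illustrative, and sys is imported explicitly here since the snippet above already relies on sys.stderr.

import sys

if __name__ == '__main__':
    # e.g. python get_url.py "Bob Marley" urls.log
    get_url(sys.argv[1], sys.argv[2])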
Example #6
    def wikiqueryresults(searchQuery):

        wiki = WikiApi({})

        wiki = WikiApi({ 'locale' : 'en' }) # To specify your locale; 'en' is the default

        wikiSearch = wiki.find(searchQuery)

        wikiArticle = wiki.get_article(wikiSearch[0])

        return wikiArticle.summary
Example #7
def wiki_api(options):
	wiki = WikiApi()
	wiki = WikiApi({ 'locale' : 'en'}) # to specify your locale, 'en' is default
	results = wiki.find(options['q'])
	for result in results:
		article = wiki.get_article(result)
		title = article.heading
		url = article.url

		print(url)
		link = Link(topic = options['topic'], title = title, url = url)
		link.save()
Example #8
File: tests.py, Project: tedpark/wiki-api
class TestUnicode(unittest.TestCase):
    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulagna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        #unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
Example #9
class TestUnicode:
    @pytest.fixture(autouse=True)
    def set_up(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.result = self.wiki.find('Bulaggna')[0]

    def test_search(self):
        # this is urlencoded.
        assert self.result == 'Bul%C3%A5ggna'

    def test_article(self):
        # unicode errors will likely blow in your face here
        assert self.wiki.get_article(self.result) is not None
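
The expected result above is URL-encoded. Decoding it with the standard library shows the underlying Emilian title, which is where the unicode concerns in these tests come from.

from urllib.parse import unquote

print(unquote('Bul%C3%A5ggna'))  # prints: Bulåggna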
Example #10
def wiki(tokens, message):

        print("\033[1;34;1m")        
        print("\nHazel : Please wait while I surf the web for a result")
        try:

            wiki = WikiApi()
            wiki = WikiApi({'locale': 'en'})  # reassign so the locale option actually applies
            if "search" in tokens:
                tokens.remove("search") # remove search keyword to retrieve the main content to be searched
            if "what" in tokens:
                tokens.remove("what")
            if "who" in tokens:
                tokens.remove("who")
            if "look" in tokens:
                tokens.remove("look")
            if "tell" in tokens:
                tokens.remove("tell")
            if "more" in tokens:
                tokens.remove("more")
            if "about" in tokens:
                tokens.remove("about")
            stop_words = set(stopwords.words('english')) # Remove stop words
            filtered_sentence = []
            for w in tokens: # Filtering input by removing stopwords such as 'I', 'for', 'is', etc.
                if w not in stop_words:
                    filtered_sentence.append(w) # Get and store message without stopwords

            filtered_list = filtered_sentence
            filtered_sentence = ' '.join(filtered_list) # Making a sentence out of the tokens

            message = filtered_sentence # storing input in message
            tokens = word_tokenize(message) # tokenize new message
     
            s = "" # appends the remaining tokens to be searched for
            for i in tokens:
                s = s + i + " " # appending the tokens to form a search keyword
            results = wiki.find(s) # search Wikipedia online via the wiki-api package
            #print("websearch\n"
            if not results:  # wiki.find returns a list; guard against an empty result
                results = ["null"]
            print("\nFound result for : ", results[0]) # print the first search result
            print("\033[1;37;1m") # set console color
            print(wiki.get_article(results[0]).summary)  # print the summary of the top result
            #main()
        except Exception as e:
            print("I didnt get that. You may want to try that again")
Example #11
def wiki_search(query):
    wiki = WikiApi()
    wikiurls = []
    lst = query.split(",")
    num = 10 // len(lst)  # integer division so the count check below can match
    #	print num
    for i in lst:
        results = wiki.find(i)
        cnt = 0
        for j in results:
            cnt = cnt + 1
            article = wiki.get_article(j)
            wikiurls.append(article.url)
            if cnt == num:
                break
    return wikiurls
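
One way the wiki_search helper above could be called; the comma-separated query is an illustrative value, and the function spreads roughly ten URLs across the listed terms.

urls = wiki_search('Bob Marley,Tom Hanks')
for url in urls:
    print(url)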
Example #12
def wiki_search(query):
	wiki = WikiApi()
	wikiurls=[]
	lst=query.split(",")
	num = 10/len(lst)
#	print num
	for i in lst:
		results = wiki.find(i)
		cnt=0
		for j in results:
			cnt=cnt+1
			article = wiki.get_article(j)
			wikiurls.append(article.url)
			if cnt==num:
				break
	return wikiurls
Example #13
def get_wiki_phrases(word):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})
    results = wiki.find(word)
    print(results)
    phrase = ""
    for i in range(min(4, len(results))):
        article = wiki.get_article(results[i])
        #print article.content
        phrase = phrase + " " + article.content
        #print phrase
    rake_object = rake.Rake("SmartStoplist.txt", 4, 3, 10)

    # Now, we have a RAKE object that extracts keywords where:
    #   Each word has at least 4 characters
    #   Each phrase has at most 3 words
    #   Each keyword appears in the text at least 10 times
    keywords = rake_object.run(phrase)
    return keywords[0:20]
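
A short usage sketch for get_wiki_phrases; the query is a placeholder, and the assumption that the returned entries are (phrase, score) pairs follows common RAKE implementations rather than anything shown above.

keywords = get_wiki_phrases('reggae')
for keyword in keywords:
    print(keyword)  # each entry is expected to be a (phrase, score) pair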
Example #14
def get_full_name_from_wiki(name):
    wiki = WikiApi()
    results = wiki.find(name)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        new_name = article.summary
        new_name = new_name[:new_name.find('(')-1]
        if new_name.find(' refer ') != -1:
            if len(results) > 1:
                article = wiki.get_article(results[1])
                new_name = article.summary
                new_name = new_name[:new_name.find('(') - 1]
            else:
                return None
        table = str.maketrans({key: None for key in string.punctuation + '\r\n'})
        new_name = new_name.translate(table)
        if len(new_name) > 4 and len(new_name) < 50:
            return new_name
        else:
            return None
    else:
        return None
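
A brief usage sketch for get_full_name_from_wiki; the input name is only an example, and the None return covers both missing articles and overly short or long summaries, as handled above.

full_name = get_full_name_from_wiki('Bill Clinton')
if full_name is None:
    print('No usable article found')
else:
    print(full_name)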
Example #15
def get_security_results(filenames):
    """
    Pre-fill visited with security term results.
    """
    global visited_results

    wiki = WikiApi({})

    phrases = []
    for filename in filenames:
        lines = readLines(filename)
        for line in lines:
            line = line.strip()
            if ((len(line) > 0) and (line[0] != '#')):
                if (line[0] == '/'):
                    phrases.append(line.split(' ', 1)[1])
                else:
                    phrases.append(line)

    for phrase in phrases:
        results = wiki.find(phrase)
        for result in results:
            if (result not in visited_results):
                visited_results.append(result)
Example #16
class Wikipedia_Scanner(object):
	"""
	Class to scan Wikipedia articles.
	"""

	def __init__(self, add_gloss_list, del_gloss_list, category, label):
		"""
		Initialize the class.
		"""
		self.add_phrases = get_phrases(add_gloss_list)
		self.del_phrases = get_phrases(del_gloss_list)
		self.category = category
		self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
		self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
		self.wiki = WikiApi({})
		self.visited_results = self.get_results(self.del_phrases)
		self.count = 0


	def get_results(self, phrases):
		"""
		Return dictionary of wiki results corresponding to phrases.
		"""
		visited_results = {}
		for phrase in phrases:
			results = self.wiki.find(phrase)
			for result in results:
				if (not visited_results.has_key(result)):
					visited_results[result] = True
		return visited_results


	def get_articles(self):
		"""
		Fetches articles and puts them in the data directory.
		"""
		for phrase in self.add_phrases:
			try:
				results = self.wiki.find(phrase)
				for result in results:
					if (not self.visited_results.has_key(result)):
						self.visited_results[result] = True

						article = self.wiki.get_article(result)
						entry_src = 'wikipedia_' + self.category
						entry_type = 'article'
						entry_id = 'wikipedia_' + result.replace(' ', '_').replace('/', '_')
						entry_title = article.heading
						entry_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
						entry_desc = clean(article.summary)

						if (''.join(entry_desc.split()) != ''):
							xml_string = bundle_xml(entry_src, entry_type, entry_id, entry_title, entry_date, entry_desc)
			
							write_string(self.corpus_dir + '/' + entry_id.lower() + '.xml', xml_string, False)
							write_string(self.raw_dir + '/' + entry_id.lower() + '.txt', entry_desc, False)
						
							self.count = self.count + 1
							if (self.count % 100 == 0):
								print 'Scanned ' + str(self.count) + ' wiki articles.'

			except Exception as e:
				print 'Wiki Api Error! [' + str(e) + ']'
Example #17
class Wikipedia_Scanner(object):
    """
	Class to scan Wikipedia articles.
	"""
    def __init__(self, add_gloss_list, del_gloss_list, category, label):
        """
		Initialize the class.
		"""
        self.add_phrases = get_phrases(add_gloss_list)
        self.del_phrases = get_phrases(del_gloss_list)
        self.category = category
        self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
        self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
        self.wiki = WikiApi({})
        self.visited_results = self.get_results(self.del_phrases)
        self.count = 0

    def get_results(self, phrases):
        """
		Return dictionary of wiki results corresponding to phrases.
		"""
        visited_results = {}
        for phrase in phrases:
            results = self.wiki.find(phrase)
            for result in results:
                if (not visited_results.has_key(result)):
                    visited_results[result] = True
        return visited_results

    def get_articles(self):
        """
		Fetches articles and puts them in the data directory.
		"""
        for phrase in self.add_phrases:
            try:
                results = self.wiki.find(phrase)
                for result in results:
                    if (not self.visited_results.has_key(result)):
                        self.visited_results[result] = True

                        article = self.wiki.get_article(result)
                        entry_src = 'wikipedia_' + self.category
                        entry_type = 'article'
                        entry_id = 'wikipedia_' + result.replace(
                            ' ', '_').replace('/', '_')
                        entry_title = article.heading
                        entry_date = datetime.now().strftime(
                            '%Y-%m-%d_%H-%M-%S-%f')
                        entry_desc = clean(article.summary)

                        if (''.join(entry_desc.split()) != ''):
                            xml_string = bundle_xml(entry_src, entry_type,
                                                    entry_id, entry_title,
                                                    entry_date, entry_desc)

                            write_string(
                                self.corpus_dir + '/' + entry_id.lower() +
                                '.xml', xml_string, False)
                            write_string(
                                self.raw_dir + '/' + entry_id.lower() + '.txt',
                                entry_desc, False)

                            self.count = self.count + 1
                            if (self.count % 100 == 0):
                                print 'Scanned ' + str(
                                    self.count) + ' wiki articles.'

            except Exception as e:
                print 'Wiki Api Error! [' + str(e) + ']'
Example #18
print(grouped)

# "calculate" genders from Wikipedia articles
gender = []
seen = {}  # memoization: author -> gender
wiki = WikiApi()
for author in grouped.column("AUTHOR"):
    if author.lower() in seen:
        print(author, "already found previously")
        gender.append(seen[author.lower()])
        continue

    try:
        try:
            print("trying to find " + author + " in wikipedia")
            results = wiki.find(author)
            wikipedia_page = wiki.get_article(results[0]).url

        except Exception:
            # errors when the article is not found; use google search instead.
            # We try to limit the number of google search queries because
            # google limits them for free accounts or something.
            print("trying to find " + author + " in google")
            wikipedia_page = google_search(author + ' site: en.wikipedia.org',
                                           num=1)[0]['link']

        g = find_gender(wikipedia_page)

    except Exception:
        # TODO: Possibly search on google for the book title and author if still
        # not found, and find some other site that has pronouns on it, if there
Example #19
from wikiapi import WikiApi
wiki = WikiApi()
wiki = WikiApi({'locale': 'es'})  # to specify your locale, 'en' is default

wiki.options

results = wiki.find('hereditary myopathies')
print()
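
The README-style snippet above stops right after the search call. Here is a minimal continuation using the same find/get_article pattern seen throughout this page; the printed attributes follow the Article objects used in the other examples.

if results:
    article = wiki.get_article(results[0])  # take the first search result
    print(article.heading)
    print(article.url)
    print(article.summary)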
Example #20
File: tests.py, Project: tedpark/wiki-api
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = wiki.get_relevant_article(results, keywords)
        self.assertTrue('Bill Clinton' in _article.heading)

    def test_get_relevant_article_no_result(self):
Example #21
 def wiki_search(self, text):
     wiki = WikiApi()
     results = wiki.find(text)
     article = wiki.get_article(results[0])
     return article
Example #22
def jarvis(data):
    first = data.split(" ")
    if first[0] == "locate" or first[0] == "location":
        import location
        return location.loco(first[1])
    if (first[0] == "play" or first[0] == "search") and first[1] == "youtube":
        del (first[0])
        del (first[0])
        a = "+".join(first)
        b = " ".join(first)
        import urllib.request
        import urllib.parse
        import re

        query_string = urllib.parse.urlencode({"search_query": a})
        html_content = urllib.request.urlopen(
            "http://www.youtube.com/results?" + query_string)
        search_results = re.findall(r'href=\"\/watch\?v=(.{11})',
                                    html_content.read().decode())
        print("playing:" + a)
        return webbrowser.open("http://www.youtube.com/watch?v=" +
                               search_results[0])
    if first[0] == "google" or first[0] == "search":
        del (first[0])
        a = "+".join(first)
        return webbrowser.open('https://www.google.co.in/search?q=' + a)
    if first[0] == "connect":
        del (first[0])
        a = "".join(first)
        return webbrowser.open(a + ".com")
    if first[0] == "who":
        del (first[0])
        a = "".join(first)
        from wikiapi import WikiApi
        wiki = WikiApi()
        wiki = WikiApi({'locale': 'en'})
        results = wiki.find(a)
        article = wiki.get_article(results[0])
        print(article.summary)
        return webbrowser.open(article.image)

    while (1):
        if data in wikipedia:
            wiki()
            break
        if data in status:
            cpustatus()
            break
        if data in welcome:
            speak("hi there")
            break
        if data in play:
            speak("ok sir")
            playsong()
            break
        if data in newfile:
            writefile()
            break
        if data in readfile:
            readfile()
            break
        if data in searchweb:
            speak("ok sir")
            search()
            break
        if data in time:
            speak(ctime())
            break
        if "close notepad" in data:
            clsnotepad()
            break
        if "close video" in data:
            clsvlc()
            break
        if "close browser" in data:
            clsbrowser()
            break
        if data in display:
            log.display()
            break
        if data in end:
            com = "close"
            return com
            break
        if data in shutdownpc:
            shutdown()
            break
        if data in folders:
            directory()
            break
        if data in closeprogram:
            close()
            break
        else:
            print("I don't understand the command!! Try again")
            break
Example #23
class TestWiki:
    @pytest.fixture(autouse=True)
    def set_up(self):
        self.wiki = WikiApi()
        self.results = self.wiki.find('Bill Clinton')
        self.article = self.wiki.get_article(self.results[0])

    def test_heading(self):
        assert self.article.heading == 'Bill Clinton'

    def test_image(self):
        assert_url_valid(url=self.article.image)

    def test_summary(self):
        results = self.wiki.find('Tom Hanks')
        article = self.wiki.get_article(results[0])

        assert 'Thomas' in article.summary
        assert 'Jeffrey' in article.summary
        assert 'Hanks' in article.summary

    def test_content(self):
        assert len(self.article.content) > 200

    def test_references(self):
        assert isinstance(self.article.references, list) is True

    def test_url(self):
        assert_url_valid(url=self.article.url)
        assert self.article.url == 'https://en.wikipedia.org/wiki/Bill_Clinton'

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = self.wiki.get_relevant_article(self.results, keywords)

        assert 'Bill Clinton' in _article.heading
        assert len(_article.content) > 5000
        assert 'President Bill Clinton' in _article.content

    def test_get_relevant_article_no_result(self):
        keywords = ['hockey player']
        _article = self.wiki.get_relevant_article(self.results, keywords)
        assert _article is None

    def test__remove_ads_from_content(self):
        content = (
            'From Wikipedia, the free encyclopedia. \n\nLee Strasberg '
            '(November 17, 1901 2013 February 17, 1982) was an American '
            'actor, director and acting teacher.\n'
            'Today, Ellen Burstyn, Al Pacino, and Harvey Keitel lead this '
            'nonprofit studio dedicated to the development of actors, '
            'playwrights, and directors.\n\nDescription above from the '
            'Wikipedia article\xa0Lee Strasberg,\xa0licensed under CC-BY-SA, '
            'full list of contributors on Wikipedia.')

        result_content = self.wiki._remove_ads_from_content(content)

        expected_content = (
            ' \n\nLee Strasberg '
            '(November 17, 1901 2013 February 17, 1982) was an American '
            'actor, director and acting teacher.\n'
            'Today, Ellen Burstyn, Al Pacino, and Harvey Keitel lead this '
            'nonprofit studio dedicated to the development of actors, '
            'playwrights, and directors.')
        assert expected_content == result_content

    @pytest.mark.parametrize(
        'url, expected_tables',
        [
            (
                'https://en.wikipedia.org/wiki/World_population',
                [
                    'Population by continent',
                    '10 most populous countries',
                    '10 most densely populated countries',
                    'Countries ranking highly in both total population and '
                    'population density',
                ],
            ),
            (
                'https://en.wikipedia.org/wiki/List_of_countries_and_'
                'dependencies_by_population',
                ['Sovereign states and dependencies by population'],
            ),
            (
                'https://en.wikipedia.org/wiki/Influenza',
                [],
            ),
            ('https://en.wikipedia.org/wiki/Germany', ['Constituent states']),
            (
                'https://en.wikipedia.org/wiki/Chess_Classic',
                [
                    'Chess Classic Championship',
                    # 'Rapid Chess Open',
                    # 'Chess960 Rapid chess World Championship',
                    'FiNet Open Chess960',
                    # 'Chess960 Computer World Championship',
                ],
            ),
            (
                'https://en.wikipedia.org/wiki/List_of_missions_to_the_Moon',
                ['Missions by date'],
            ),
            (
                'https://en.wikipedia.org/wiki/'
                'List_of_people_who_have_walked_on_the_Moon',
                ['Moonwalkers'],
            )
        ],
    )
    def test_get_tables_returns_expected_keys(self, url, expected_tables):
        tables = self.wiki.get_tables(url=url)

        assert list(tables.keys()) == expected_tables

    def test_get_tables(self, mocker):
        url = ('https://en.wikipedia.org/wiki/'
               'COVID-19_pandemic_by_country_and_territory')

        tables = self.wiki.get_tables(url=url)

        assert tables
        assert isinstance(tables, dict)
        assert tuple(tables.keys()) == (
            'COVID-19 pandemic by location 20 September 2020',
            'COVID-19 cases and deaths by region, '
            'in absolute figures and per million '
            'inhabitants as of 5 September 2020',
            'First COVID-19 cases by country or territory',
            'States with no confirmed COVID-19 cases',
            'Partially recognized states with no confirmed cases',
            'Dependencies with no confirmed cases',
        )
        assert tables['Dependencies with no confirmed cases'].T.to_dict() == {
            0: {
                'Rank': 1,
                'Territory': 'American Samoa',
                'Population': 56700,
                'Status': 'Unincorporated territory',
                'Country': 'United States',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            1: {
                'Rank': 2,
                'Territory': 'Cook Islands',
                'Population': 15200,
                'Status': 'Associated state',
                'Country': 'New Zealand',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            2: {
                'Rank': 3,
                'Territory': 'Wallis and Futuna',
                'Population': 11700,
                'Status': 'Overseas collectivity',
                'Country': 'France',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            3: {
                'Rank': 4,
                'Territory': 'Saint Helena, Ascension and Tristan da Cunha',
                'Population': 5633,
                'Status': 'Overseas territory',
                'Country': 'United Kingdom',
                'Continent': 'Africa',
                'Ref.': mocker.ANY,
            },
            4: {
                'Rank': 5,
                'Territory': 'Svalbard',
                'Population': 2667,
                'Status': 'Unincorporated area',
                'Country': 'Norway',
                'Continent': 'Europe',
                'Ref.': mocker.ANY,
            },
            5: {
                'Rank': 6,
                'Territory': 'Christmas Island',
                'Population': 1955,
                'Status': 'External territory',
                'Country': 'Australia',
                'Continent': 'Asia',
                'Ref.': mocker.ANY,
            },
            6: {
                'Rank': 7,
                'Territory': 'Norfolk Island',
                'Population': 1735,
                'Status': 'External territory',
                'Country': 'Australia',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            7: {
                'Rank': 8,
                'Territory': 'Niue',
                'Population': 1520,
                'Status': 'Associated state',
                'Country': 'New Zealand',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            8: {
                'Rank': 9,
                'Territory': 'Tokelau',
                'Population': 1400,
                'Status': 'Dependent territory',
                'Country': 'New Zealand',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            9: {
                'Rank': 10,
                'Territory': 'Cocos (Keeling) Islands',
                'Population': 555,
                'Status': 'External territory',
                'Country': 'Australia',
                'Continent': 'Asia',
                'Ref.': mocker.ANY,
            },
            10: {
                'Rank': 11,
                'Territory': 'Pitcairn Islands',
                'Population': 50,
                'Status': 'Overseas territory',
                'Country': 'United Kingdom',
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
        }
        assert tables['States with no confirmed COVID-19 cases'].T.to_dict(
        ) == {
            0: {
                'Rank': 1,
                'Country': 'North Korea[a]',
                'Population': 25778816,
                'Continent': 'Asia',
                'Ref.': mocker.ANY,
            },
            1: {
                'Rank': 2,
                'Country': 'Turkmenistan[a]',
                'Population': 6031200,
                'Continent': 'Asia',
                'Ref.': mocker.ANY,
            },
            2: {
                'Rank': 3,
                'Country': 'Solomon Islands',
                'Population': 686884,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            3: {
                'Rank': 4,
                'Country': 'Vanuatu',
                'Population': 307145,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            4: {
                'Rank': 5,
                'Country': 'Samoa',
                'Population': 198413,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            5: {
                'Rank': 6,
                'Country': 'Kiribati',
                'Population': 119451,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            6: {
                'Rank': 7,
                'Country': 'Micronesia',
                'Population': 115030,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            7: {
                'Rank': 8,
                'Country': 'Tonga',
                'Population': 105695,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            8: {
                'Rank': 9,
                'Country': 'Marshall Islands',
                'Population': 59190,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            9: {
                'Rank': 10,
                'Country': 'Palau',
                'Population': 18094,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            10: {
                'Rank': 11,
                'Country': 'Tuvalu',
                'Population': 11793,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
            11: {
                'Rank': 12,
                'Country': 'Nauru',
                'Population': 10823,
                'Continent': 'Oceania',
                'Ref.': mocker.ANY,
            },
        }
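
Outside of the test suite, the get_tables behaviour exercised above could be used roughly like this; the URL is one of the pages listed in the parametrized test, and the assumption that the values are pandas DataFrames follows from the .T.to_dict() calls above.

wiki = WikiApi()
tables = wiki.get_tables(url='https://en.wikipedia.org/wiki/World_population')
for caption, table in tables.items():
    print(caption)       # table caption, e.g. 'Population by continent'
    print(table.head())  # assumed to be a pandas DataFrame, per the tests above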
Example #24
from wikiapi import WikiApi
wiki = WikiApi()
wiki = WikiApi({ 'locale' : 'en'})

keywords=[]

with open("Important_Names.txt","r") as f:
	for line in f:
		keywords.append(line)
f.close()

count=0
for word in keywords:
	count=count+1
	results = wiki.find(word.strip('\n'))
	if len(results)!=0:
		article = wiki.get_article(results[0])
		text=article.content.encode('utf-8')
		with open("Web"+str(count)+".txt","w") as f:
			f.write(text)
		f.close()
		print article.url
Example #25
class WikiGrabber(object):
    """
    Class to grab the wiki articles.
    """

    def __init__(self, filenames):
        """
        Initialize the WikiGrabber class.
        """
        self.glossary = Glossary(filenames)
        self.wiki = WikiApi({})


    def get_articles(self, dir_name):
        """
        Get wiki articles for all the phrases and convert to xml.
        """
        global visited_results
        step = 1000 + len(visited_results)
        try:
            for phrase, flag in self.glossary.phrases:
                print phrase
                results = self.wiki.find(phrase)
                for result in results:
                    if (result not in visited_results):
                        article = self.wiki.get_article(result)
                        self.article_to_xml(article, flag, dir_name)
                        visited_results.append(result)
                        if (len(visited_results) > step):
                            print phrase, len(visited_results)
                            step = step + 1000
        except:
            print phrase, len(visited_results)


    def article_to_xml(self, article, flag, dir_name):
        """
        Create an XML document from the article.
        """
        try:
            docId = 'Wiki_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            docType = 'Wiki'
            docSource = 'wikipedia'
            docDate = ''
            docTitle = article.heading
            docDesc = clean(article.summary)

            if (len(docDesc.split()) < WORD_LEN_THRESHOLD):
                return 

            if (flag and ('security' not in docDesc.lower())):
                return

            document = lb.E.Document(
                lb.E.Title(docTitle),
                lb.E.Date(docDate),
                lb.E.Description(docDesc),
                id=docId, type=docType, src=docSource)		
            doc = etree.tostring(document, pretty_print=True)

            xml_filename = dir_name + docId + '.xml'
            writeString(xml_filename, XML_HEAD + doc)
        except Exception as e:
            print e
Example #26
def getWikiArticle(word, locale):
    wiki = WikiApi({ 'locale' : locale})
    results = wiki.find(word)
    result = next(iter(results or []), None)
    return wiki.get_article(result) if result else None
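
A brief usage sketch for getWikiArticle; the word and locale are placeholder values.

article = getWikiArticle('Influenza', 'en')
if article is not None:
    print(article.heading)
    print(article.summary)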
Example #27
def main():
    status = True
    pygame.mixer.music.play(-1)
    music_status = 1
#   Create a wikiapi instance
    wiki_status = 1
    wiki_instance = WikiApi()
    wiki_instance = WikiApi({'locale': 'en'})
    namespace = None

    index1 = 0
    data_list = []

#   Load weather data into lists and dictionaries
    weather_location = 0
    connector = yweather.Client()
    weather_id_ny = connector.fetch_woeid('New York')
    weather_data_ny = connector.fetch_weather(str(weather_id_ny), metric=True)
    data_dict_ny = {}
    data_dict_ny.update({'Current Temperature': weather_data_ny["condition"]["temp"], \
                    'Sunrise': weather_data_ny['astronomy']['sunrise'],\
                         'Sunset': weather_data_ny['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_ny['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_ny['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_ny['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_ny['condition']['text']})
    keys_list_ny = data_dict_ny.keys()

    weather_id_buffalo = connector.fetch_woeid('Buffalo')
    weather_data_buffalo = connector.fetch_weather(str(weather_id_buffalo), metric=True)
    data_dict_buffalo = {}
    data_dict_buffalo.update({'Current Temperature': weather_data_buffalo["condition"]["temp"], \
                    'Sunrise': weather_data_buffalo['astronomy']['sunrise'],\
                              'Sunset': weather_data_buffalo['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_buffalo['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_buffalo['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_buffalo['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_buffalo['condition']['text']})
    keys_list_buffalo = data_dict_buffalo.keys()

    weather_id_hyd = connector.fetch_woeid('Hyderabad')
    weather_data_hyd = connector.fetch_weather(str(weather_id_hyd), metric=True)
    data_dict_hyd = {}
    data_dict_hyd.update({'Current Temperature': weather_data_hyd["condition"]["temp"], \
                    'Sunrise': weather_data_hyd['astronomy']['sunrise'], \
                          'Sunset': weather_data_hyd['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_hyd['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_hyd['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_hyd['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_hyd['condition']['text']})
    keys_list_hyd = data_dict_hyd.keys()

    while status:
            main_display.fill(black)
            pointer_location = pygame.mouse.get_pos()
            pointer_click = pygame.mouse.get_pressed()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    
#   Music Button
                if 325 < pointer_location[0] < 405 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        wiki_status = 1
                if 700 < pointer_location[0] < 780 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        music_status = not music_status
                        if music_status == 0:
                            pygame.mixer.music.pause()
                        else:
                            pygame.mixer.music.unpause()
#   New York  Button Check
                if 20 < pointer_location[0] < 80 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 2
                    
#   Buffalo  Button Check
                if 100 < pointer_location[0] < 160 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 1
                    
#   Hyderabad  Button Check
                if 180 < pointer_location[0] < 240 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 0
            try:
                main_display.blit(weather_image, (0,0))
            except:
                pass
            
#   Data Display
            if weather_location == 0:
                data_display(110, data_dict_hyd['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_hyd[5] + " : " + data_dict_hyd['Condition'], white, 95, 260)  # Condition
                data_display(15, keys_list_hyd[1] + " : " + data_dict_hyd['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_hyd[6] + " : " + data_dict_hyd['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_hyd[4] + " : " + data_dict_hyd['Sunrise'], white, 95, 400)   # Sunrise
                data_display(15, keys_list_hyd[0] + " : " + data_dict_hyd['Sunset'], white, 95, 440)    # Sunset
                data_display(15, keys_list_hyd[3] + " : " + data_dict_hyd['Wind'], white, 95, 480)  # Wind Speed

            elif weather_location == 1:
                data_display(110, data_dict_buffalo['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_buffalo[5] + " : " + data_dict_buffalo['Condition'], white, 95, 260)
                data_display(15, keys_list_buffalo[1] + " : " + data_dict_buffalo['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_buffalo[6] + " : " + data_dict_buffalo['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_buffalo[4] + " : " + data_dict_buffalo['Sunrise'], white, 95, 400)
                data_display(15, keys_list_buffalo[0] + " : " + data_dict_buffalo['Sunset'], white, 95, 440)
                data_display(15, keys_list_buffalo[3] + " : " + data_dict_buffalo['Wind'], white, 95, 480)


            elif weather_location == 2:
                data_display(110, data_dict_ny['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_ny[5] + " : " + data_dict_ny['Condition'], white, 95, 260)  # Condition
                data_display(15, keys_list_ny[1] + " : " + data_dict_ny['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_ny[6] + " : " + data_dict_ny['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_ny[4] + " : " + data_dict_ny['Sunrise'], white, 95, 400)   # Sunrise
                data_display(15, keys_list_ny[0] + " : " + data_dict_ny['Sunset'], white, 95, 440)    # Sunset
                data_display(15, keys_list_ny[3] + " : " + data_dict_ny['Wind'], white, 95, 480)  # Wind Speed
                
#   Display Wiki Article
            if wiki_status == 1:
                del data_list[:]
                wiki_status = 0
                blahblah = True
                try:
                    url = 'http://en.wikipedia.org/wiki/Special:Random'
                    if namespace != None:
                        url += '/' + namespace
                    req = urllib2.Request(url, None, { 'User-Agent' : 'x'})
                    page = urllib2.urlopen(req).readlines()
                    wiki_draft1 = remove_tags(page[4])
                    wiki_title = wiki_draft1[:wiki_draft1.index('Wikipedia') - 2]
                    wiki_data_list = wiki_instance.find(wiki_title)
                    wiki_data = wiki_instance.get_article(wiki_data_list[0])
                    temp = endlinefunction(wiki_data.summary, data_list, 90)
                except (urllib2.HTTPError, urllib2.URLError):
                    print "Failed to get article"
                    raise
                    
#   Buttons and Division Display
            pygame.draw.rect(main_display, white, (300, 0, 5, 600))
            pygame.draw.rect(main_display, white, (300, 70, 500, 5))
            drawbutton(wood, 700, 20, 80, 30, 10, "Toggle Music", black)
            drawbutton(white, 20, 20, 60, 30, 10, "New York", black)
            drawbutton(white, 100, 20, 60, 30, 10, "Buffalo", black)
            drawbutton(white, 180, 20, 60, 30, 10, "Hyderabad", black)
            drawbutton(wood, 325, 20, 80, 30, 10, "Next Article", black)
            
#   Cursor Display
            data_display(15, wiki_data.heading, wood, 540, 130)
            y_cood = 150
            j = 25
            for i in range(0, len(data_list)):
                y_cood = y_cood + j
                data_display(10, data_list[i], black, 540, y_cood)
            clock.tick(100)
            pygame.display.flip()
Example #28
class Scraper:

    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more likely to
    # be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set([p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)

        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent 
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []

        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)

            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)

        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string == None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string

        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text, media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)

        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference numbers
    # e.g. [2]. 
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
Example #29
with open('./questions.txt') as f:
    questions = [line[:-1] for line in f]

model = AnswerFinder(config=c, restore=True, mode="inference")
wiki = WikiApi()  # WikiApi instance used by wiki.find() and wiki.get_article() below
print('\n\n\n\n\n\n\n')
print(
    '''Hello! This is an alpha version of a program that reads Wikipedia to answer questions.
The program is based on the paper https://arxiv.org/pdf/1704.00051.pdf
For more detail [email protected]\n''')

c.inf_threshold = 0.7
while True:
    while True:
        print('What or who do you want to ask about? Example: Barack Obama')
        thing = input()
        results = wiki.find(thing)
        if len(results) > 0:
            print('Ok. I found few wiki pages about {}.'.format(thing))
            break
        else:
            print(
                'Can\'t find any wiki pages about {}. Try another one.'.format(
                    thing))

    article = wiki.get_article(results[0])
    context = article.content
    for question in questions:
        os.system('clear')
        print('Q: {}'.format(question))
        print('Search answers ...')
        answers, probs = tools.get_answer(context, question, model, c)
Example #30
class Scraper:

    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more likely to
    # be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set(
            [p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)

        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []

        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)

            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(
                    page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)

        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string == None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string

        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text,
                    media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)

        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference numbers
    # e.g. [2].
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
Example #31
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_url(self):
        self.assertEqual(article.url,
                         u"http://en.wikipedia.org/wiki/Bill_Clinton")

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
Example #32
		if i not in stpw:
			keyw.append(i)

	return keyw
#************************************
#************************************

wiki = WikiApi({})
dic_cont = {}  # dictionary
mlist = []  # word base
#************************************

for wtopic in file1.readlines():
	w = wtopic.split()
	mlist.append(w[0])
	results = wiki.find(w[0])
	if results:

		article = wiki.get_article(results[0])
		r = article.content
		rtoken = wordpunct_tokenize(r)

		# remove English stopwords
		stopwords = nltk.corpus.stopwords.words('english')
		content = [wip for wip in rtoken if wip.lower() not in stopwords]
		# keep only tokens that contain at least one alphabetic character
		fcontent = [wip for wip in content if re.sub(r'[^A-Za-z]', "", wip)]
		gcontent = []
		for i in fcontent:
			gcontent.append(i.encode('utf-8').lower().strip())