Example #1
    def extract_unique_nes(self, input_dir='../workspace/frog_periodicals', fresh=False,
                            max_documents=None, max_words_per_doc=None,
                            filename='../workspace/nes2wikilinks.p'):
        """
        Extracts all unique entities in the frogged files under input_dir as a dict.
        Registers in this dict which relevant wiki pages the NE could refer to,
        according to the Wikipedia search interface.
        Only considers NEs that:
            * are capitalized
            * are longer than 3 characters (cf. 'Van')
            * don't end in a dot (e.g. 'A.F.Th.')
            * are tagged as B-PER by Frog
        """
        if fresh:
            print('Extracting NEs from ', max_documents, 'documents!')
            wikipedia = Wikipedia(language='nl', throttle=3)
            self.nes2wikilinks = {}

            for filepath in glob.glob(input_dir+'/*.txt.out'):
                max_words = max_words_per_doc
                for line in codecs.open(filepath, 'r', 'utf8'):
                    try:
                        comps = [c for c in line.strip().split('\t') if c]
                        idx, token, lemma, pos, conf, ne  = comps
                        token = token.replace('_', ' ')
                        if ne.startswith('B-PER') and token[0].isupper() and len(token) > 3 and not token.endswith('.'):
                            if token not in self.nes2wikilinks:
                                try: # to look up the page in wikipedia:
                                    article = wikipedia.search(token)
                                    if article: # if we find something...
                                        if article.categories[0] == 'Wikipedia:Doorverwijspagina': # we are dealing with a disambiguation page
                                            for link in article.links:
                                                if link in self.page_ids:
                                                    if token not in self.nes2wikilinks:
                                                        self.nes2wikilinks[token] = set()
                                                    self.nes2wikilinks[token].add(link)
                                        else:
                                            if article.title in self.page_ids:
                                                self.nes2wikilinks[token] = set([article.title])
                                except: # probably a download issue...
                                    continue
                        max_words -= 1
                        if max_words < 0:
                            break
                    except ValueError: # probably parsing error in the frog file
                        continue

                # update stats:
                max_documents -= 1
                if max_documents % 10 == 0:
                    print('\t+ ', max_documents, 'documents to go')
                    print('\t+ ', len(self.nes2wikilinks), 'NEs collected')
                if max_documents < 0:
                    break

            pickle.dump(self.nes2wikilinks, open(filename, 'wb'))

        else:
            self.nes2wikilinks = pickle.load(open(filename, 'rb'))
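The selection rules in the docstring of extract_unique_nes above reduce to a small predicate on the token and its Frog tag. A minimal, self-contained sketch of that filter (the helper name is_candidate_ne is illustrative, not part of the original class):

def is_candidate_ne(token, ne_tag):
    # Same filter as above: B-PER tag, capitalized, longer than
    # 3 characters, and not ending in a dot.
    token = token.replace('_', ' ')
    return (ne_tag.startswith('B-PER')
            and token[:1].isupper()
            and len(token) > 3
            and not token.endswith('.'))

# is_candidate_ne('Mulisch', 'B-PER')  -> True
# is_candidate_ne('Van', 'B-PER')      -> False (too short)
# is_candidate_ne('A.F.Th.', 'B-PER')  -> False (ends in a dot)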
Example #2
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            for title in section.string.split('\n'):
                file.write(title.strip() + "\n")
    file.close()
Example #3
def testWikipedia(palabra):
    """
    Retrieves the Wikipedia article for the given word.
    """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)

    print resultados.plaintext()
    return resultados
Example #4
    def mentions_from_backlinks(self, backlinks={}, fresh=False, filename='../workspace/mentions.p', context_window_size=150):
        """
        Mines backlinking pages for mentions of the page_ids in backlinks.
        Returns five parallel lists, holding for each mention:
            * target_id (correct page title)
            * the name variant (inside the a-tag)
            * left context of the mention (contiguous character string, with len = context_window_size)
            * right context of the mention (contiguous character string, with len = context_window_size)
            * a counter of other page_ids mentioned on the page
        """
        if not backlinks:
            backlinks = self.backlinks

        # initialize data containers:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []

        if fresh:
            print('>>> Mining mentions from', sum([len(v) for k,v in backlinks.items()]),
                  'backlinking pages to', len(backlinks), 'target pages')
            print(backlinks)
            wikipedia = Wikipedia(language='nl', throttle=2)

            for idx, (page_id, links) in enumerate(backlinks.items()):
                print('\t + mining mentions of', page_id, '('+str(len(links)), 'backlinks) | page', idx+1, '/', len(backlinks))
                for backlink in links:
                    try:
                        article = wikipedia.search(backlink) # fetch the linking page via pattern
                        if not article.categories[0] == 'Wikipedia:Doorverwijspagina': # skip referral pages
                            print('\t\t* backlink:', backlink)
                            section_sources = [] # fetch the html-sections of individual sections:
                            if not article.sections: # article doesn't have sections
                                section_sources = [article.source]
                            else:
                                section_sources = [section.source for section in article.sections]
                            # loop over the section sources and extract all relevant mentions:
                            for section_source in section_sources:
                                ts, nvs, lcs, rcs, cnts = self.mentions_from_section(source=section_source,
                                                                                     target_id=page_id,
                                                                                     context_window_size=context_window_size)
                                if nvs:
                                    target_ids.extend(ts)
                                    name_variants.extend(nvs)
                                    left_contexts.extend(lcs)
                                    right_contexts.extend(rcs)
                                    page_counts.extend(cnts)
                    except:
                        continue

            pickle.dump((target_ids, name_variants, left_contexts, right_contexts, page_counts), open(filename, 'wb'))

        else:
            target_ids, name_variants, left_contexts, right_contexts, page_counts = \
                                                        pickle.load(open(filename, 'rb'))

        self.mentions = (target_ids, name_variants, left_contexts, right_contexts, page_counts)
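The left and right contexts described in the docstring above are fixed-length character windows around a mention. A minimal sketch of that idea on plain text (context_windows is an illustrative helper, not the original mentions_from_section):

def context_windows(text, mention, context_window_size=150):
    # Return (left, right) character windows around the first
    # occurrence of `mention` in `text`.
    start = text.find(mention)
    if start == -1:
        return None
    end = start + len(mention)
    left = text[max(0, start - context_window_size):start]
    right = text[end:end + context_window_size]
    return left, right

# context_windows('In 1952 publiceerde Harry Mulisch zijn roman.', 'Harry Mulisch', 20)
# -> ('In 1952 publiceerde ', ' zijn roman.')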
Example #5
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
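The "* " + name test above keeps only links that open a list item in the plain text, i.e. names of individuals rather than bands mentioned mid-line. A tiny illustration on a made-up plaintext fragment:

s = ("* Rob Zombie, musician and filmmaker\n"
     "* Mark White, bass guitarist for Spin Doctors\n")
links = ["Rob Zombie", "Spin Doctors"]
print([name for name in links if "* " + name in s])  # ['Rob Zombie']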
Example #6
def getFeaturedContent():
    wiki = Wikipedia()
    list = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = list.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
            time.sleep(0.2)
            print(title + " Get! " + str(i) + "/2000")
Example #7
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #8
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = ['places','cities','capitals','countries','people','wars']
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
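The loop above just looks for a news-related keyword as a substring of any of the article's category names. The same check, isolated as a small self-contained sketch (the category strings here are made up):

def looks_newsworthy(categories,
                     keywords=('places', 'cities', 'capitals',
                               'countries', 'people', 'wars')):
    # True if any keyword occurs in any lower-cased category name.
    return any(k in c.lower() for c in categories for k in keywords)

print(looks_newsworthy(['Capitals in Europe', 'Port cities']))  # True
print(looks_newsworthy(['Programming languages']))              # False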
Example #9
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ( "he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f" # More "he" or more "she"?
        return g
    except:
        return None
Example #10
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f"  # More "he" or more "she"?
        return g
    except:
        return None
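The gender heuristic above just compares counts of masculine and feminine pronouns in the lower-cased plain text. An offline sketch of that counting step (the sample sentence is made up):

def pronoun_gender(text):
    s = " " + text.lower().replace("\n", "\n ") + " "
    m = sum(s.count(" %s " % x) for x in ("he", "his"))
    f = sum(s.count(" %s " % y) for y in ("she", "her"))
    return "m" if m > f else "f"

print(pronoun_gender("She published her first novel in 1953."))  # 'f'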
Example #11
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub(
                            '-', ' ',
                            keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub(
                        '-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
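When a keyword leads to a disambiguation page or to no page at all, the loop above retries with two rewrites: hyphens become spaces, and short all-lowercase keywords (likely acronyms) are upper-cased. That fallback, isolated as a sketch (fallback_keyword is an illustrative name):

def fallback_keyword(keyword):
    # Apply the same rewrites as above; return None if nothing changed.
    new = keyword.replace('-', ' ')        # hyphen -> whitespace
    if new.islower() and len(new) <= 5:
        new = new.upper()                  # short lowercase word -> acronym
    return new if new != keyword else None

print(fallback_keyword('self-attention'))  # 'self attention'
print(fallback_keyword('lstm'))            # 'LSTM'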
Example #12
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #13
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = [
            'places', 'cities', 'capitals', 'countries', 'people', 'wars'
        ]
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
Example #14
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #15
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
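The first branch above relies on either the .bday microformat node or a "(born ...)" pattern in the article's intro. A small offline sketch of just the regular-expression part, on a made-up snippet:

import re
from datetime import date

s = "Jane Doe (born 1 December 2000) is a fictional example."
r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"
birth_year = int(re.search(r, s).group(2))
print(birth_year, date.today().year - birth_year)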
Example #16
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        # e = the percentage of links to articles about academic titles / achievements.
        e = [t("a[href*='%s']" % x) for x in (
            "academi"    , "academy_of" , "bachelor_of" , "college"     , 
            "degree"     , "doctor"     , "emeritus"    , "engineer"    , 
            "faculty"    , "fellow"     , "genius"      , "grandmaster" , 
            "institut"   , "invent"     , "master_of"   , "mathemati"   , 
            "phd"        , "ph.d"       , "physics"     , "professor"   , 
            "school_of"  , "scien"      , "student"     , "universi"    , 
            "valedictor" , 
        )]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cyrus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {
            0.000: 0.40,
            0.003: 0.50,
            0.010: 0.60,
            0.020: 0.70,
            0.030: 0.80,
            0.060: 0.90,
            1.000: 1.00
        }
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
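The m mapping above is a piecewise-linear rescaling of the raw link ratio into a friendlier 0.0-1.0 score. The interpolation step, isolated as a sketch:

def rescale(e):
    m = {0.000: 0.40, 0.003: 0.50, 0.010: 0.60, 0.020: 0.70,
         0.030: 0.80, 0.060: 0.90, 1.000: 1.00}
    e = min(max(e, 0.0), 1.0)
    # Interpolate linearly between the two calibration points around e.
    for x, y in zip(sorted(m), sorted(m)[1:]):
        if y > e:
            return m[x] + (m[y] - m[x]) * (e - x) / (y - x)
    return 1.0

print(round(rescale(0.015), 2))  # midway between the 0.010 and 0.020 anchors -> 0.65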
Example #17
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse 
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)" # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0] # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r) # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)" # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
Example #18
def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
Example #19
def run(o):

	#http://www.clips.ua.ac.be/pages/pattern-web#mail
	# should be able to do some cool stuff with the pattern libs	

	import os, sys;# sys.path.insert(0, os.path.join("..", ".."))

	from pattern.web import Wikipedia

	# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
	# Wikipedia queries request the article HTML source from the server. This can be slow.
	# It is a good idea to cache results from Wikipedia locally,
	# and to set a high timeout when calling Wikipedia.search().

	engine = Wikipedia(language="en")

	# Contrary to the other search engines in the pattern.web module,
	# Wikipedia simply returns one WikipediaArticle object (or None),
	# instead of a list of results.
	article = engine.search("alice in wonderland", cached=True, timeout=30)

	print article.title               # Article title (may differ from the search query).
	print
	print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
	print article.links[:10], "..."   # List of linked Wikipedia articles.
	print article.external[:5], "..." # List of external URL's.
	print

	#print article.source # The full article content as HTML.
	#print article.string # The full article content, plain text with HTML tags stripped.

	# An article is made up of different sections with a title.
	# WikipediaArticle.sections is a list of WikipediaSection objects.
	# Each section has a title + content and can have a linked parent section or child sections.
	for s in article.sections:
	    print s.title.upper()
	    print 
	    print s.content # = ArticleSection.string, minus the title.
	    print
	    
Example #20
	def search(self, query, language='es'):
		'''
			query: string
			language: 'en' or 'es'
		'''
		wikipedia = Wikipedia(language=language)
		google_result_list = self.google.simple_search(query + ' ' + 'wikipedia')
		wikipedia_results = []
		for result in google_result_list:
			try:
				if self.url_pattern in result['link']:
					article = {}
					title = result['title'].split(' - ')[0]
					print title
					art = wikipedia.search(title)
					print art
					article['title'] = art.title
					article['text'] = art.string
					article['related'] = art.links
					wikipedia_results.append(article)
			except:
				pass
		return wikipedia_results
Example #21
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors.
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src) # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0 # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ( "he", "his")) / n > 0.01: # 1% he
            return True
        if sum(s.count(" %s " % w) for w in ("she", "her")) / n > 0.01: # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in (
          "city",
          "country",
          "house",
          "land",
          "location",
          "place",
          "room",
          "rooms",
          "space",
          "setting", 
          "town")) / n > 0.01:
            return True
    return False
Example #22
def descarga(titulo):
    engine = Wikipedia(language="en")
    result= engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
Example #23
def recherche_wikipedia(recherche, language):
	"""
		Main function of this Python 2.7 script: it queries Wikipedia
		in order to retrieve the information about a page from
		Wikipedia.

		The function only formats the information for "fr" and "en"
		searches, and more precisely, for "fr", the sections
		Fiche technique, Distribution, Voix françaises and Voix originales,
		and for "en" the Cast section. This function can of course be
		extended to retrieve more information.
	"""
	datas = yaml.load(open("/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"))
	engine = Wikipedia(language=language)
	searching = engine.search(recherche)
	Sections = searching.sections
	metadata = {}
	def fonction(part=True,sepa = ":"):
		"""
			Helper internal to recherche_wikipedia that reshapes the
			retrieved text so that it is properly formatted before
			being passed to yaml.load().
		"""
		temp = [x.strip() for x in section.content.replace('* ','').split('\n') if x != u""]
		liste = []
		for element in temp:
			element = element.encode('utf-8')
			if part:
				(cle,sep,attr) = element.partition(sepa)
			else:
				(cle,sep,attr) = element.rpartition(sepa)
			attr = attr.strip()
			cle = cle.strip()
			if "'" in cle:
				attr = attr.replace("'","''")
			if "'" in attr:
				attr = attr.replace("'","''")
			if ":" in cle:
				cle = cle.replace(':','--')
			if ":" in attr:
				attr = attr.replace(":","--")
			element = " ".join([x for x in [cle+sep, attr] if x != '""'])
			if element_inString(element,datas["countries"]):
				element = " "+element
			elif (not ":" in element):
				element = " "+element
			liste.append(element)
		return liste
	
	if language == "fr":
		for section in Sections:
			if section.title == u"Fiche technique":
				metadata.update({"Fiche_technique":yaml.load("\n".join(fonction()[1:-1]))})
			elif section.title == u"Distribution":
				temp = fonction()
				if len(temp) != 1:
					metadata.update({"Distribution":yaml.load("\n".join(fonction(part=False)[1:-1]))})
			elif section.title == u"Voix françaises":
				metadata.update({u"Voix françaises":yaml.load('\n'.join(fonction()[1:-1]))})
			elif section.title == u"Voix originales":
				metadata.update({"Voix originales":yaml.load('\n'.join(fonction()[1:-1]))})
	if language == "en":
		for section in Sections:
			if section.title == 'Cast':
				liste = []
				for element in fonction(sepa="as")[1:-1]:
					(cle, sep, val) = element.partition('as')
					element = cle+":"+val
					liste.append(element)
				metadata.update({"Casting":yaml.load('\n'.join(liste))})
	#return metadata
	return yaml.dump(metadata, default_flow_style = False, allow_unicode = True)
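The inner helper above turns section lines such as "Titre original : Alice au pays des merveilles" into key: value lines that yaml.load() can parse. A reduced, self-contained sketch of that reshaping (requires PyYAML; the sample lines are made up):

import yaml

lines = ["Titre original : Alice in Wonderland",
         "Pays d'origine : Royaume-Uni"]
cleaned = []
for element in lines:
    cle, sep, attr = element.partition(":")
    cleaned.append("%s%s %s" % (cle.strip(), sep, attr.strip()))
print(yaml.safe_load("\n".join(cleaned)))
# -> {'Titre original': 'Alice in Wonderland', "Pays d'origine": 'Royaume-Uni'}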
Example #24
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        # e = the percentage of links to articles about academic titles / achievements.
        e = [
            t("a[href*='%s']" % x) for x in (
                "academi",
                "academy_of",
                "bachelor_of",
                "college",
                "degree",
                "doctor",
                "emeritus",
                "engineer",
                "faculty",
                "fellow",
                "genius",
                "grandmaster",
                "institut",
                "invent",
                "master_of",
                "mathemati",
                "phd",
                "ph.d",
                "physics",
                "professor",
                "school_of",
                "scien",
                "student",
                "universi",
                "valedictor",
            )
        ]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cyrus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {
            0.000: 0.40,
            0.003: 0.50,
            0.010: 0.60,
            0.020: 0.70,
            0.030: 0.80,
            0.060: 0.90,
            1.000: 1.00
        }
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
Example #25
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors.
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src)  # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0  # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ("he", "his")) / n > 0.01:  # 1% he
            return True
        if sum(s.count(" %s " % w)
               for w in ("she", "her")) / n > 0.01:  # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(
                s.count(" %s " % w) for w in
            ("city", "country", "house", "land", "location", "place", "room",
             "rooms", "space", "setting", "town")) / n > 0.01:
            return True
    return False
Example #26
output = open("ocaml_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Ocaml')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #27
def recherche_wikipedia(recherche, language):
    """
        Main function of this Python 2.7 script: it queries Wikipedia
        in order to retrieve the information about a page from
        Wikipedia.

        The function only formats the information for "fr" and "en"
        searches, and more precisely, for "fr", the sections
        Fiche technique, Distribution, Voix françaises and Voix originales,
        and for "en" the Cast section. This function can of course be
        extended to retrieve more information.
	"""
    datas = yaml.load(
        open(
            "/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"
        ))
    engine = Wikipedia(language=language)
    searching = engine.search(recherche)
    Sections = searching.sections
    metadata = {}

    def fonction(part=True, sepa=":"):
        """
            Helper internal to recherche_wikipedia that reshapes the
            retrieved text so that it is properly formatted before
            being passed to yaml.load().
		"""
        temp = [
            x.strip() for x in section.content.replace('* ', '').split('\n')
            if x != u""
        ]
        liste = []
        for element in temp:
            element = element.encode('utf-8')
            if part:
                (cle, sep, attr) = element.partition(sepa)
            else:
                (cle, sep, attr) = element.rpartition(sepa)
            attr = attr.strip()
            cle = cle.strip()
            if "'" in cle:
                attr = attr.replace("'", "''")
            if "'" in attr:
                attr = attr.replace("'", "''")
            if ":" in cle:
                cle = cle.replace(':', '--')
            if ":" in attr:
                attr = attr.replace(":", "--")
            element = " ".join([x for x in [cle + sep, attr] if x != '""'])
            if element_inString(element, datas["countries"]):
                element = " " + element
            elif (not ":" in element):
                element = " " + element
            liste.append(element)
        return liste

    if language == "fr":
        for section in Sections:
            if section.title == u"Fiche technique":
                metadata.update({
                    "Fiche_technique":
                    yaml.load("\n".join(fonction()[1:-1]))
                })
            elif section.title == u"Distribution":
                temp = fonction()
                if len(temp) != 1:
                    metadata.update({
                        "Distribution":
                        yaml.load("\n".join(fonction(part=False)[1:-1]))
                    })
            elif section.title == u"Voix françaises":
                metadata.update({
                    u"Voix françaises":
                    yaml.load('\n'.join(fonction()[1:-1]))
                })
            elif section.title == u"Voix originales":
                metadata.update({
                    "Voix originales":
                    yaml.load('\n'.join(fonction()[1:-1]))
                })
    if language == "en":
        for section in Sections:
            if section.title == 'Cast':
                liste = []
                for element in fonction(sepa="as")[1:-1]:
                    (cle, sep, val) = element.partition('as')
                    element = cle + ":" + val
                    liste.append(element)
                metadata.update({"Casting": yaml.load('\n'.join(liste))})
    #return metadata
    return yaml.dump(metadata, default_flow_style=False, allow_unicode=True)
Example #28
output = open("new_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Python_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('python programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #29
output = open("better_ruby_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Ruby_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('ruby programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #30
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# A query requests the article's HTML source from the server, which can be quite slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to other search engines in the module,
# Wikipedia simply returns one WikipediaArticle object (or None) instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title               # Article title (may differ from the search query).
print
print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."   # List of linked Wikipedia articles.
print article.external[:5], "..." # List of external URL's.
print

#print article.source # The full article content as HTML.
#print article.string # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print 
Example #31
output = open("Lua_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Lua_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Lua Programming Language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #32
    def mentions_from_backlinks(self,
                                backlinks={},
                                fresh=False,
                                filename='mentions.p',
                                context_window_size=150):
        """
        Mines backlinking pages for mentions of the page_ids in backlinks.
        Returns five parallel lists, holding for each mention:
            * target_id (correct page title)
            * the name variant (inside the a-tag)
            * left context of the mention (contiguous character string, with len = context_window_size)
            * right context of the mention (contiguous character string, with len = context_window_size)
            * a counter of other page_ids mentioned on the page
        """
        if not backlinks:
            backlinks = self.backlinks

        # initialize data containers:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []

        if fresh:

            logging.info(
                'Mining mentions from %d backlinking pages to %d target pages.'
                % (sum([len(v)
                        for k, v in backlinks.items()]), len(backlinks)))

            wikipedia = Wikipedia(language='nl', throttle=2)

            for idx, (page_id, links) in enumerate(backlinks.items()):

                logging.debug(
                    '\t + mining mentions of %s (%s backlinks) | page %d / %d'
                    % (page_id, len(links), idx + 1, len(backlinks)))

                for backlink in links:
                    article = wikipedia.search(backlink)
                    # skip referral pages
                    if article and not article.categories[
                            0] == 'Wikipedia:Doorverwijspagina':
                        logging.debug('\t\t* backlink: %s' % backlink)
                        # fetch the html-sections of individual sections:
                        section_sources = []
                        # article doesn't have sections
                        if not article.sections:
                            section_sources = [article.source]
                        else:
                            section_sources = [
                                section.source for section in article.sections
                            ]
                        # loop over the section sources and extract all
                        # relevant mentions:
                        for section_source in section_sources:
                            ts, nvs, lcs, rcs, cnts = self.mentions_from_section(
                                source=section_source,
                                target_id=page_id,
                                context_window_size=context_window_size)
                            if nvs:
                                target_ids.extend(ts)
                                name_variants.extend(nvs)
                                left_contexts.extend(lcs)
                                right_contexts.extend(rcs)
                                page_counts.extend(cnts)

            with open(os.path.join(self.workspace, filename), 'wb') as out:
                pickle.dump((target_ids, name_variants, left_contexts,
                             right_contexts, page_counts), out)

        else:
            with open(os.path.join(self.workspace, filename), 'rb') as inf:
                target_ids, name_variants, left_contexts, right_contexts, page_counts = pickle.load(
                    inf)

        self.mentions = (target_ids, name_variants, left_contexts,
                         right_contexts, page_counts)
Example #33
    def extract_unique_nes(self,
                           input_dir='frog_periodicals',
                           fresh=False,
                           max_documents=None,
                           max_words_per_doc=None,
                           filename='nes2wikilinks.p',
                           testfiles=[]):
        """
        Extracts all unique entities in the frogged files under input_dir as a dict.
        Registers in this dict which relevant wiki pages the NE could refer to,
        according to the Wikipedia search interface.
        Only considers NEs that:
            * are capitalized
            * are longer than 3 characters (cf. 'Van')
            * don't end in a full stop (e.g. 'A.F.Th.')
            * are tagged as B-PER by Frog
        """
        if fresh:
            logging.info('Extracting NEs from documents!')
            wikipedia = Wikipedia(language='nl', throttle=3)
            self.nes2wikilinks = {}

            for filepath in glob.glob(
                    os.sep.join((self.workspace, input_dir)) + '/*.txt.out'):
                if testfiles:
                    fp = os.path.basename(filepath).replace('.txt.out', '')
                    if fp not in testfiles:
                        continue
                max_words = max_words_per_doc
                for line in codecs.open(
                        os.sep.join((self.workspace, filepath)), 'r', 'utf8'):
                    try:
                        comps = [c for c in line.strip().split('\t') if c]
                        idx, token, lemma, pos, conf, ne = comps
                        token = token.replace('_', ' ')
                        if ne.startswith('B-PER') and token[0].isupper(
                        ) and len(token) > 3 and not token.endswith('.'):
                            if token not in self.nes2wikilinks:
                                try:  # to look up the page in wikipedia:
                                    article = wikipedia.search(token)
                                    if article:  # if we find something...
                                        # we are dealing with a disambiguation page
                                        if article.categories[
                                                0] == 'Wikipedia:Doorverwijspagina':
                                            for link in article.links:
                                                if link in self.page_ids:
                                                    if token not in self.nes2wikilinks:
                                                        self.nes2wikilinks[
                                                            token] = set()
                                                    self.nes2wikilinks[
                                                        token].add(link)
                                        else:
                                            if article.title in self.page_ids:
                                                self.nes2wikilinks[
                                                    token] = set(
                                                        [article.title])
                                except:  # probably a download issue...
                                    continue
                        max_words -= 1
                        if max_words < 0:
                            break
                    # probably parsing error in the frog file
                    except ValueError:
                        continue

                # update stats:
                max_documents -= 1
                if max_documents % 10 == 0:
                    logging.info('\t+ %d documents to go' % max_documents)
                    logging.info('\t+ %d NEs collected' %
                                 len(self.nes2wikilinks))
                if max_documents < 0:
                    break

            with open(os.path.join(self.workspace, filename), 'wb') as out:
                pickle.dump(self.nes2wikilinks, out)

        else:
            with open(os.path.join(self.workspace, filename), 'rb') as inf:
                self.nes2wikilinks = pickle.load(inf)
Example #34
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()
# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])
print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print article.source # The full article content as HTML.
# print article.string # The full article content, plain text with HTML
# tags stripped.

# An article is made up of different sections with a title.
Example #35
def getWikiAutor(nombre):
    engine = Wikipedia(license=None, throttle=5.0, language='en')
    return engine.search(nombre)