Example #1
    def extract_unique_nes(self, input_dir='../workspace/frog_periodicals', fresh=False,
                            max_documents=None, max_words_per_doc=None,
                            filename='../workspace/nes2wikilinks.p'):
        """
        Extracts all unique entities in the frogged files under input_dir as a dict.
        Registers in this dict which relevant wiki pages the NE could refer to,
        according to the Wikipedia search interface.
        Only considers NEs that:
            * are capitalized
            * are longer than 3 characters (cf. 'Van')
            * don't end in a dot (e.g. 'A.F.Th.')
            * are tagged as B-PER by Frog
        """
        if fresh:
            print('Extracting NEs from ', max_documents, 'documents!')
            wikipedia = Wikipedia(language='nl', throttle=3)
            self.nes2wikilinks = {}

            for filepath in glob.glob(input_dir+'/*.txt.out'):
                max_words = max_words_per_doc
                for line in codecs.open(filepath, 'r', 'utf8'):
                    try:
                        comps = [c for c in line.strip().split('\t') if c]
                        idx, token, lemma, pos, conf, ne  = comps
                        token = token.replace('_', ' ')
                        if ne.startswith('B-PER') and token[0].isupper() and len(token) > 3 and not token.endswith('.'):
                            if token not in self.nes2wikilinks:
                                try: # to look up the page in wikipedia:
                                    article = wikipedia.search(token)
                                    if article: # if we find something...
                                        if article.categories[0] == 'Wikipedia:Doorverwijspagina': # we are dealing with a disambiguation page
                                            for link in article.links:
                                                if link in self.page_ids:
                                                    if token not in self.nes2wikilinks:
                                                        self.nes2wikilinks[token] = set()
                                                    self.nes2wikilinks[token].add(link)
                                        else:
                                            if article.title in self.page_ids:
                                                self.nes2wikilinks[token] = set([article.title])
                                except: # probably a download issue...
                                    continue
                        max_words -= 1
                        if max_words < 0:
                            break
                    except ValueError: # probably parsing error in the frog file
                        continue

                # update stats:
                max_documents -= 1
                if max_documents % 10 == 0:
                    print('\t+ ', max_documents, 'documents to go')
                    print('\t+ ', len(self.nes2wikilinks), 'NEs collected')
                if max_documents < 0:
                    break

            pickle.dump(self.nes2wikilinks, open(filename, 'wb'))

        else:
            self.nes2wikilinks = pickle.load(open(filename, 'rb'))
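The selection rules in the docstring of extract_unique_nes above reduce to a small predicate on the token and its Frog tag. A minimal, self-contained sketch of that filter (the helper name is_candidate_ne is illustrative, not part of the original class):

def is_candidate_ne(token, ne_tag):
    # Same filter as above: B-PER tag, capitalized, longer than
    # 3 characters, and not ending in a dot.
    token = token.replace('_', ' ')
    return (ne_tag.startswith('B-PER')
            and token[:1].isupper()
            and len(token) > 3
            and not token.endswith('.'))

# is_candidate_ne('Mulisch', 'B-PER')  -> True
# is_candidate_ne('Van', 'B-PER')      -> False (too short)
# is_candidate_ne('A.F.Th.', 'B-PER')  -> False (ends in a dot)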
Example #2
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            for title in section.string.split('\n'):
                file.write(title.strip() + "\n")
    file.close()
Example #3
def testWikipedia(palabra):
    """
    Retrieves the Wikipedia article for the given word.
    """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)

    print resultados.plaintext()
    return resultados
Example #4
    def mentions_from_backlinks(self, backlinks={}, fresh=False, filename='../workspace/mentions.p', context_window_size=150):
        """
        Mines backlinking pages for mentions of the page_ids in backlinks.
        Returns five parallel lists, holding for each mention:
            * target_id (correct page title)
            * the name variant (inside the a-tag)
            * left context of the mention (contiguous character string, with len = context_window_size)
            * right context of the mention (contiguous character string, with len = context_window_size)
            * a counter of other page_ids mentioned on the page
        """
        if not backlinks:
            backlinks = self.backlinks

        # initialize data containers:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []

        if fresh:
            print('>>> Mining mentions from', sum([len(v) for k,v in backlinks.items()]),
                  'backlinking pages to', len(backlinks), 'target pages')
            print(backlinks)
            wikipedia = Wikipedia(language='nl', throttle=2)

            for idx, (page_id, links) in enumerate(backlinks.items()):
                print('\t + mining mentions of', page_id, '('+str(len(links)), 'backlinks) | page', idx+1, '/', len(backlinks))
                for backlink in links:
                    try:
                        article = wikipedia.search(backlink) # fetch the linking page via pattern
                        if not article.categories[0] == 'Wikipedia:Doorverwijspagina': # skip referral pages
                            print('\t\t* backlink:', backlink)
                            section_sources = [] # fetch the html-sections of individual sections:
                            if not article.sections: # article doesn't have sections
                                section_sources = [article.source]
                            else:
                                section_sources = [section.source for section in article.sections]
                            # loop over the section sources and extract all relevant mentions:
                            for section_source in section_sources:
                                ts, nvs, lcs, rcs, cnts = self.mentions_from_section(source=section_source,
                                                                                     target_id=page_id,
                                                                                     context_window_size=context_window_size)
                                if nvs:
                                    target_ids.extend(ts)
                                    name_variants.extend(nvs)
                                    left_contexts.extend(lcs)
                                    right_contexts.extend(rcs)
                                    page_counts.extend(cnts)
                    except:
                        continue

            pickle.dump((target_ids, name_variants, left_contexts, right_contexts, page_counts), open(filename, 'wb'))

        else:
            target_ids, name_variants, left_contexts, right_contexts, page_counts = \
                                                        pickle.load(open(filename, 'rb'))

        self.mentions = (target_ids, name_variants, left_contexts, right_contexts, page_counts)
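The left and right contexts described in the docstring above are fixed-length character windows around a mention. A minimal sketch of that idea on plain text (context_windows is an illustrative helper, not the original mentions_from_section):

def context_windows(text, mention, context_window_size=150):
    # Return (left, right) character windows around the first
    # occurrence of `mention` in `text`.
    start = text.find(mention)
    if start == -1:
        return None
    end = start + len(mention)
    left = text[max(0, start - context_window_size):start]
    right = text[end:end + context_window_size]
    return left, right

# context_windows('In 1952 publiceerde Harry Mulisch zijn roman.', 'Harry Mulisch', 20)
# -> ('In 1952 publiceerde ', ' zijn roman.')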
Example #5
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
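The "* " + name test above keeps only links that open a list item in the plain text, i.e. names of individuals rather than bands mentioned mid-line. A tiny illustration on a made-up plaintext fragment:

s = ("* Rob Zombie, musician and filmmaker\n"
     "* Mark White, bass guitarist for Spin Doctors\n")
links = ["Rob Zombie", "Spin Doctors"]
print([name for name in links if "* " + name in s])  # ['Rob Zombie']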
Example #6
def getFeaturedContent():
    wiki = Wikipedia()
    list = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = list.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
            time.sleep(0.2)
            print(title + " Get! " + str(i) + "/2000")
Example #7
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #8
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = ['places','cities','capitals','countries','people','wars']
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
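The loop above just looks for a news-related keyword as a substring of any of the article's category names. The same check, isolated as a small self-contained sketch (the category strings here are made up):

def looks_newsworthy(categories,
                     keywords=('places', 'cities', 'capitals',
                               'countries', 'people', 'wars')):
    # True if any keyword occurs in any lower-cased category name.
    return any(k in c.lower() for c in categories for k in keywords)

print(looks_newsworthy(['Capitals in Europe', 'Port cities']))  # True
print(looks_newsworthy(['Programming languages']))              # False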
Example #9
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ( "he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f" # More "he" or more "she"?
        return g
    except:
        return None
Example #10
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f"  # More "he" or more "she"?
        return g
    except:
        return None
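The gender heuristic above just compares counts of masculine and feminine pronouns in the lower-cased plain text. An offline sketch of that counting step (the sample sentence is made up):

def pronoun_gender(text):
    s = " " + text.lower().replace("\n", "\n ") + " "
    m = sum(s.count(" %s " % x) for x in ("he", "his"))
    f = sum(s.count(" %s " % y) for y in ("she", "her"))
    return "m" if m > f else "f"

print(pronoun_gender("She published her first novel in 1953."))  # 'f'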
Example #11
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub(
                            '-', ' ',
                            keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub(
                        '-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
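When a keyword leads to a disambiguation page or to no page at all, the loop above retries with two rewrites: hyphens become spaces, and short all-lowercase keywords (likely acronyms) are upper-cased. That fallback, isolated as a sketch (fallback_keyword is an illustrative name):

def fallback_keyword(keyword):
    # Apply the same rewrites as above; return None if nothing changed.
    new = keyword.replace('-', ' ')        # hyphen -> whitespace
    if new.islower() and len(new) <= 5:
        new = new.upper()                  # short lowercase word -> acronym
    return new if new != keyword else None

print(fallback_keyword('self-attention'))  # 'self attention'
print(fallback_keyword('lstm'))            # 'LSTM'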
Example #12
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #13
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = [
            'places', 'cities', 'capitals', 'countries', 'people', 'wars'
        ]
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
Example #14
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example #15
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
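The first branch above relies on either the .bday microformat node or a "(born ...)" pattern in the article's intro. A small offline sketch of just the regular-expression part, on a made-up snippet:

import re
from datetime import date

s = "Jane Doe (born 1 December 2000) is a fictional example."
r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"
birth_year = int(re.search(r, s).group(2))
print(birth_year, date.today().year - birth_year)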
Example #16
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        # e = the percentage of links to articles about academic titles / achievements.
        e = [t("a[href*='%s']" % x) for x in (
            "academi"    , "academy_of" , "bachelor_of" , "college"     , 
            "degree"     , "doctor"     , "emeritus"    , "engineer"    , 
            "faculty"    , "fellow"     , "genius"      , "grandmaster" , 
            "institut"   , "invent"     , "master_of"   , "mathemati"   , 
            "phd"        , "ph.d"       , "physics"     , "professor"   , 
            "school_of"  , "scien"      , "student"     , "universi"    , 
            "valedictor" , 
        )]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cyrus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {
            0.000: 0.40,
            0.003: 0.50,
            0.010: 0.60,
            0.020: 0.70,
            0.030: 0.80,
            0.060: 0.90,
            1.000: 1.00
        }
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
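The m mapping above is a piecewise-linear rescaling of the raw link ratio into a friendlier 0.0-1.0 score. The interpolation step, isolated as a sketch:

def rescale(e):
    m = {0.000: 0.40, 0.003: 0.50, 0.010: 0.60, 0.020: 0.70,
         0.030: 0.80, 0.060: 0.90, 1.000: 1.00}
    e = min(max(e, 0.0), 1.0)
    # Interpolate linearly between the two calibration points around e.
    for x, y in zip(sorted(m), sorted(m)[1:]):
        if y > e:
            return m[x] + (m[y] - m[x]) * (e - x) / (y - x)
    return 1.0

print(round(rescale(0.015), 2))  # midway between the 0.010 and 0.020 anchors -> 0.65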
Example #17
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse 
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)" # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0] # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r) # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)" # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
Example #18
def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
Example #19
def run(o):

	#http://www.clips.ua.ac.be/pages/pattern-web#mail
	# should be able to do some cool stuff with the pattern libs	

	import os, sys;# sys.path.insert(0, os.path.join("..", ".."))

	from pattern.web import Wikipedia

	# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
	# Wikipedia queries request the article HTML source from the server. This can be slow.
	# It is a good idea to cache results from Wikipedia locally,
	# and to set a high timeout when calling Wikipedia.search().

	engine = Wikipedia(language="en")

	# Contrary to the other search engines in the pattern.web module,
	# Wikipedia simply returns one WikipediaArticle object (or None),
	# instead of a list of results.
	article = engine.search("alice in wonderland", cached=True, timeout=30)

	print article.title               # Article title (may differ from the search query).
	print
	print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
	print article.links[:10], "..."   # List of linked Wikipedia articles.
	print article.external[:5], "..." # List of external URL's.
	print

	#print article.source # The full article content as HTML.
	#print article.string # The full article content, plain text with HTML tags stripped.

	# An article is made up of different sections with a title.
	# WikipediaArticle.sections is a list of WikipediaSection objects.
	# Each section has a title + content and can have a linked parent section or child sections.
	for s in article.sections:
	    print s.title.upper()
	    print 
	    print s.content # = ArticleSection.string, minus the title.
	    print
	    
Example #20
	def search(self, query, language='es'):
		'''
			query: string
			language: 'en' or 'es'
		'''
		wikipedia = Wikipedia(language=language)
		google_result_list = self.google.simple_search(query + ' ' + 'wikipedia')
		wikipedia_results = []
		for result in google_result_list:
			try:
				if self.url_pattern in result['link']:
					article = {}
					title = result['title'].split(' - ')[0]
					print title
					art = wikipedia.search(title)
					print art
					article['title'] = art.title
					article['text'] = art.string
					article['related'] = art.links
					wikipedia_results.append(article)
			except:
				pass
		return wikipedia_results
Example #21
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors.
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src) # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0 # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ( "he", "his")) / n > 0.01: # 1% he
            return True
        if sum(s.count(" %s " % w) for w in ("she", "her")) / n > 0.01: # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in (
          "city",
          "country",
          "house",
          "land",
          "location",
          "place",
          "room",
          "rooms",
          "space",
          "setting", 
          "town")) / n > 0.01:
            return True
    return False
Example #22
def descarga(titulo):
    engine = Wikipedia(language="en")
    result= engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
Example #23
def recherche_wikipedia(recherche, language):
	"""
		Main function of this Python 2.7 script: it queries Wikipedia
		in order to retrieve the information about a page from
		Wikipedia.

		The function only formats the information for "fr" and "en"
		searches, and more precisely, for "fr", the sections
		Fiche technique, Distribution, Voix françaises and Voix originales,
		and for "en" the Cast section. This function can of course be
		extended to retrieve more information.
	"""
	datas = yaml.load(open("/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"))
	engine = Wikipedia(language=language)
	searching = engine.search(recherche)
	Sections = searching.sections
	metadata = {}
	def fonction(part=True,sepa = ":"):
		"""
			Helper internal to recherche_wikipedia that reshapes the
			retrieved text so that it is properly formatted before
			being passed to yaml.load().
		"""
		temp = [x.strip() for x in section.content.replace('* ','').split('\n') if x != u""]
		liste = []
		for element in temp:
			element = element.encode('utf-8')
			if part:
				(cle,sep,attr) = element.partition(sepa)
			else:
				(cle,sep,attr) = element.rpartition(sepa)
			attr = attr.strip()
			cle = cle.strip()
			if "'" in cle:
				attr = attr.replace("'","''")
			if "'" in attr:
				attr = attr.replace("'","''")
			if ":" in cle:
				cle = cle.replace(':','--')
			if ":" in attr:
				attr = attr.replace(":","--")
			element = " ".join([x for x in [cle+sep, attr] if x != '""'])
			if element_inString(element,datas["countries"]):
				element = " "+element
			elif (not ":" in element):
				element = " "+element
			liste.append(element)
		return liste
	
	if language == "fr":
		for section in Sections:
			if section.title == u"Fiche technique":
				metadata.update({"Fiche_technique":yaml.load("\n".join(fonction()[1:-1]))})
			elif section.title == u"Distribution":
				temp = fonction()
				if len(temp) != 1:
					metadata.update({"Distribution":yaml.load("\n".join(fonction(part=False)[1:-1]))})
			elif section.title == u"Voix françaises":
				metadata.update({u"Voix françaises":yaml.load('\n'.join(fonction()[1:-1]))})
			elif section.title == u"Voix originales":
				metadata.update({"Voix originales":yaml.load('\n'.join(fonction()[1:-1]))})
	if language == "en":
		for section in Sections:
			if section.title == 'Cast':
				liste = []
				for element in fonction(sepa="as")[1:-1]:
					(cle, sep, val) = element.partition('as')
					element = cle+":"+val
					liste.append(element)
				metadata.update({"Casting":yaml.load('\n'.join(liste))})
	#return metadata
	return yaml.dump(metadata, default_flow_style = False, allow_unicode = True)
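The inner helper above turns section lines such as "Titre original : Alice au pays des merveilles" into key: value lines that yaml.load() can parse. A reduced, self-contained sketch of that reshaping (requires PyYAML; the sample lines are made up):

import yaml

lines = ["Titre original : Alice in Wonderland",
         "Pays d'origine : Royaume-Uni"]
cleaned = []
for element in lines:
    cle, sep, attr = element.partition(":")
    cleaned.append("%s%s %s" % (cle.strip(), sep, attr.strip()))
print(yaml.safe_load("\n".join(cleaned)))
# -> {'Titre original': 'Alice in Wonderland', "Pays d'origine": 'Royaume-Uni'}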
Example #24
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        # e = the percentage of links to articles about academic titles / achievements.
        e = [
            t("a[href*='%s']" % x) for x in (
                "academi",
                "academy_of",
                "bachelor_of",
                "college",
                "degree",
                "doctor",
                "emeritus",
                "engineer",
                "faculty",
                "fellow",
                "genius",
                "grandmaster",
                "institut",
                "invent",
                "master_of",
                "mathemati",
                "phd",
                "ph.d",
                "physics",
                "professor",
                "school_of",
                "scien",
                "student",
                "universi",
                "valedictor",
            )
        ]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cyrus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {
            0.000: 0.40,
            0.003: 0.50,
            0.010: 0.60,
            0.020: 0.70,
            0.030: 0.80,
            0.060: 0.90,
            1.000: 1.00
        }
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
Example #25
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors.
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src)  # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0  # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ("he", "his")) / n > 0.01:  # 1% he
            return True
        if sum(s.count(" %s " % w)
               for w in ("she", "her")) / n > 0.01:  # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(
                s.count(" %s " % w) for w in
            ("city", "country", "house", "land", "location", "place", "room",
             "rooms", "space", "setting", "town")) / n > 0.01:
            return True
    return False
Example #26
output = open("ocaml_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Ocaml')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #27
def recherche_wikipedia(recherche, language):
    """
        Main function of this Python 2.7 script: it queries Wikipedia
        in order to retrieve the information about a page from
        Wikipedia.

        The function only formats the information for "fr" and "en"
        searches, and more precisely, for "fr", the sections
        Fiche technique, Distribution, Voix françaises and Voix originales,
        and for "en" the Cast section. This function can of course be
        extended to retrieve more information.
	"""
    datas = yaml.load(
        open(
            "/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"
        ))
    engine = Wikipedia(language=language)
    searching = engine.search(recherche)
    Sections = searching.sections
    metadata = {}

    def fonction(part=True, sepa=":"):
        """
            Helper internal to recherche_wikipedia that reshapes the
            retrieved text so that it is properly formatted before
            being passed to yaml.load().
		"""
        temp = [
            x.strip() for x in section.content.replace('* ', '').split('\n')
            if x != u""
        ]
        liste = []
        for element in temp:
            element = element.encode('utf-8')
            if part:
                (cle, sep, attr) = element.partition(sepa)
            else:
                (cle, sep, attr) = element.rpartition(sepa)
            attr = attr.strip()
            cle = cle.strip()
            if "'" in cle:
                attr = attr.replace("'", "''")
            if "'" in attr:
                attr = attr.replace("'", "''")
            if ":" in cle:
                cle = cle.replace(':', '--')
            if ":" in attr:
                attr = attr.replace(":", "--")
            element = " ".join([x for x in [cle + sep, attr] if x != '""'])
            if element_inString(element, datas["countries"]):
                element = " " + element
            elif (not ":" in element):
                element = " " + element
            liste.append(element)
        return liste

    if language == "fr":
        for section in Sections:
            if section.title == u"Fiche technique":
                metadata.update({
                    "Fiche_technique":
                    yaml.load("\n".join(fonction()[1:-1]))
                })
            elif section.title == u"Distribution":
                temp = fonction()
                if len(temp) != 1:
                    metadata.update({
                        "Distribution":
                        yaml.load("\n".join(fonction(part=False)[1:-1]))
                    })
            elif section.title == u"Voix françaises":
                metadata.update({
                    u"Voix françaises":
                    yaml.load('\n'.join(fonction()[1:-1]))
                })
            elif section.title == u"Voix originales":
                metadata.update({
                    "Voix originales":
                    yaml.load('\n'.join(fonction()[1:-1]))
                })
    if language == "en":
        for section in Sections:
            if section.title == 'Cast':
                liste = []
                for element in fonction(sepa="as")[1:-1]:
                    (cle, sep, val) = element.partition('as')
                    element = cle + ":" + val
                    liste.append(element)
                metadata.update({"Casting": yaml.load('\n'.join(liste))})
    #return metadata
    return yaml.dump(metadata, default_flow_style=False, allow_unicode=True)
Example #28
output = open("new_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Python_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('python programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #29
output = open("better_ruby_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Ruby_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('ruby programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #30
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# A query requests the article's HTML source from the server, which can be quite slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to other search engines in the module,
# Wikipedia simply returns one WikipediaArticle object (or None) instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title               # Article title (may differ from the search query).
print
print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."   # List of linked Wikipedia articles.
print article.external[:5], "..." # List of external URL's.
print

#print article.source # The full article content as HTML.
#print article.string # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print 
Example #31
output = open("Lua_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Lua_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Lua Programming Language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
Example #32
    def mentions_from_backlinks(self,
                                backlinks={},
                                fresh=False,
                                filename='mentions.p',
                                context_window_size=150):
        """
        Mines backlinking pages for mentions of the page_ids in backlinks.
        Returns five parallel lists, holding for each mention:
            * target_id (correct page title)
            * the name variant (inside the a-tag)
            * left context of the mention (contiguous character string, with len = context_window_size)
            * right context of the mention (contiguous character string, with len = context_window_size)
            * a counter of other page_ids mentioned on the page
        """
        if not backlinks:
            backlinks = self.backlinks

        # initialize data containers:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []

        if fresh:

            logging.info(
                'Mining mentions from %d backlinking pages to %d target pages.'
                % (sum([len(v)
                        for k, v in backlinks.items()]), len(backlinks)))

            wikipedia = Wikipedia(language='nl', throttle=2)

            for idx, (page_id, links) in enumerate(backlinks.items()):

                logging.debug(
                    '\t + mining mentions of %s (%s backlinks) | page %d / %d'
                    % (page_id, len(links), idx + 1, len(backlinks)))

                for backlink in links:
                    article = wikipedia.search(backlink)
                    # skip referral pages
                    if article and not article.categories[
                            0] == 'Wikipedia:Doorverwijspagina':
                        logging.debug('\t\t* backlink: %s' % backlink)
                        # fetch the html-sections of individual sections:
                        section_sources = []
                        # article doesn't have sections
                        if not article.sections:
                            section_sources = [article.source]
                        else:
                            section_sources = [
                                section.source for section in article.sections
                            ]
                        # loop over the section sources and extract all
                        # relevant mentions:
                        for section_source in section_sources:
                            ts, nvs, lcs, rcs, cnts = self.mentions_from_section(
                                source=section_source,
                                target_id=page_id,
                                context_window_size=context_window_size)
                            if nvs:
                                target_ids.extend(ts)
                                name_variants.extend(nvs)
                                left_contexts.extend(lcs)
                                right_contexts.extend(rcs)
                                page_counts.extend(cnts)

            with open(os.path.join(self.workspace, filename), 'wb') as out:
                pickle.dump((target_ids, name_variants, left_contexts,
                             right_contexts, page_counts), out)

        else:
            with open(os.path.join(self.workspace, filename), 'rb') as inf:
                target_ids, name_variants, left_contexts, right_contexts, page_counts = pickle.load(
                    inf)

        self.mentions = (target_ids, name_variants, left_contexts,
                         right_contexts, page_counts)
Example #33
    def extract_unique_nes(self,
                           input_dir='frog_periodicals',
                           fresh=False,
                           max_documents=None,
                           max_words_per_doc=None,
                           filename='nes2wikilinks.p',
                           testfiles=[]):
        """
        Extracts all unique entities in the frogged files under input_dir as a dict.
        Registers in this dict which relevant wiki pages the NE could refer to,
        according to the Wikipedia search interface.
        Only considers NEs that:
            * are capitalized
            * are longer than 3 characters (cf. 'Van')
            * don't end in a full stop (e.g. 'A.F.Th.')
            * are tagged as B-PER by Frog
        """
        if fresh:
            logging.info('Extracting NEs from documents!')
            wikipedia = Wikipedia(language='nl', throttle=3)
            self.nes2wikilinks = {}

            for filepath in glob.glob(
                    os.sep.join((self.workspace, input_dir)) + '/*.txt.out'):
                if testfiles:
                    fp = os.path.basename(filepath).replace('.txt.out', '')
                    if fp not in testfiles:
                        continue
                max_words = max_words_per_doc
                for line in codecs.open(
                        os.sep.join((self.workspace, filepath)), 'r', 'utf8'):
                    try:
                        comps = [c for c in line.strip().split('\t') if c]
                        idx, token, lemma, pos, conf, ne = comps
                        token = token.replace('_', ' ')
                        if ne.startswith('B-PER') and token[0].isupper(
                        ) and len(token) > 3 and not token.endswith('.'):
                            if token not in self.nes2wikilinks:
                                try:  # to look up the page in wikipedia:
                                    article = wikipedia.search(token)
                                    if article:  # if we find something...
                                        # we are dealing with a disambiguation page
                                        if article.categories[
                                                0] == 'Wikipedia:Doorverwijspagina':
                                            for link in article.links:
                                                if link in self.page_ids:
                                                    if token not in self.nes2wikilinks:
                                                        self.nes2wikilinks[
                                                            token] = set()
                                                    self.nes2wikilinks[
                                                        token].add(link)
                                        else:
                                            if article.title in self.page_ids:
                                                self.nes2wikilinks[
                                                    token] = set(
                                                        [article.title])
                                except:  # probably a download issue...
                                    continue
                        max_words -= 1
                        if max_words < 0:
                            break
                    # probably parsing error in the frog file
                    except ValueError:
                        continue

                # update stats:
                max_documents -= 1
                if max_documents % 10 == 0:
                    logging.info('\t+ %d documents to go' % max_documents)
                    logging.info('\t+ %d NEs collected' %
                                 len(self.nes2wikilinks))
                if max_documents < 0:
                    break

            with open(os.path.join(self.workspace, filename), 'wb') as out:
                pickle.dump(self.nes2wikilinks, out)

        else:
            with open(os.path.join(self.workspace, filename), 'rb') as inf:
                self.nes2wikilinks = pickle.load(inf)
Example #34
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()
# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])
print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print article.source # The full article content as HTML.
# print article.string # The full article content, plain text with HTML
# tags stripped.

# An article is made up of different sections with a title.
Example #35
def getWikiAutor(nombre):
    engine = Wikipedia(license=None, throttle=5.0, language='en')
    return engine.search(nombre)