Example #1
0
def getWikipediaPage(namedEntity, returnAlist=True):
	'''
	Scrapes the Wikipedia page and extracts the intro of the article
	(so it can later be passed through the Open Information Extractors).
	'''
	paragraphTextList = []
	wikidataUrl = None

	#get the article name by disambiguating
	articleName = namedEntity
	#articleName = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')

	if articleName != None:
		articleNameNoSpace = articleName.replace(u' ', u'_')
		#we try to transform the article name into a URI-readable string if it contains an IRI code
		articleNameNoSpace = utils.iriToUri(articleNameNoSpace)
	#if we don't find the right query we return None
	else:
		return None

	#get the article url
	articleUrl = 'https://en.wikipedia.org/wiki/%s' %(articleNameNoSpace)
	
	try:
		#prepare a beautiful soup
		articleObject = urllib2.urlopen(articleUrl.encode('utf8'))
	#if there is an HTTP error it means the page doesn't exist, so we return None
	except HTTPError:
		return None

	articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
	#get the text inside all paragraphs
	paragraphList = articleSoup.body.findAll(u'p')

	#we only take the text from the paragraphs, not the tags
	for paragraph in paragraphList:
		paragraphText = paragraph.text
		#we return None if we land on a disambiguation page
		if u'may refer to:' in paragraphText:
			return None
		#we clean the text from all wikipedia references
		cleanedParagraphText = re.sub(ur'\[[\d]+\]', u'', paragraphText)
		paragraphTextList.append(cleanedParagraphText)
	
	#we scrape the wikidata url from wikipedia
	wikidataRow = articleSoup.body.find(u'a', {u'title' : u'Link to connected data repository item [g]'})
	if wikidataRow != None:
		wikidataUrl = wikidataRow.attrs[u'href']
	

	#we return the intro, the article name and the wikidata url
	if returnAlist == True:
		return paragraphTextList, articleNameNoSpace, wikidataUrl
	else:
		return u' '.join(paragraphTextList), articleNameNoSpace, wikidataUrl
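
A minimal usage sketch for getWikipediaPage (the named entity below is only an illustrative input; the module-level imports urllib2, BeautifulSoup, re and the custom utils module are assumed to be in place):

result = getWikipediaPage(u'Alan Turing', returnAlist=False)
#the function returns None on missing pages, disambiguation pages or failed queries
if result is not None:
	introText, articleName, wikidataUrl = result
	print(articleName)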
Example #2
0
def getGoogleSuggestedQueries(namedEntity):
	'''
	If we get no result for the search, it searches
	for Google's suggestions containing the named entity and
	returns the list of suggested queries.
	If there's still nothing, it returns None.
	'''
	#we transform the named entity to utf8
	namedEntity = utils.toUtf8(namedEntity)

	#we try to transform the named entity into a URI-readable string if it contains an IRI code
	namedEntity = utils.iriToUri(namedEntity)

	#we build the address bar query
	regexWords = re.compile(ur'[\w]+', re.UNICODE)
	listOfNeWords = regexWords.findall(namedEntity)

	googleSuggestionsUrl = u'http://suggestqueries.google.com/complete/search?output=toolbar&q='

	#we make a string corresponding to the 2nd half of the search URL
	for indexNeWord, neWord in enumerate(listOfNeWords):

		#we try to make the named entity word readable for the address bar
		neWord = utils.toUtf8(neWord)
		
		#if it's the first word of the query
		if indexNeWord == 0:
			googleSuggestionsUrl += neWord
		#if it's not the first word
		else:
			googleSuggestionsUrl += u'%20' + neWord

	suggestionsPage = urllib2.urlopen(googleSuggestionsUrl)
	suggestionSoup = BeautifulSoup(suggestionsPage.read(), 'lxml')
		
	#we scrape the data from the suggestion webpage
	tagList = suggestionSoup.findAll(u'suggestion')
	googleSuggestedQueriesList = []
	for tag in tagList:
		googleSuggestedQueriesList.append(tag[u'data'])

	if len(googleSuggestedQueriesList) != 0:
		return googleSuggestedQueriesList
	else:
		return None
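
A short usage sketch for getGoogleSuggestedQueries (the entity string is illustrative; the same urllib2/BeautifulSoup/utils imports as above are assumed):

suggestions = getGoogleSuggestedQueries(u'Turing')
if suggestions is not None:
	#the first element is the most likely suggested query
	print(suggestions[0])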
Example #3
0
def firstElementOfDisambiguationPage(articleSoup, repository, lang):
	'''
	When the article is a disambiguation page, we use a regex to 
	extract the default first description of the selected language
	'''
	#initialise the return values so they are defined even if no valid description link is found
	langDescription, item = None, None
	descriptionSnakviews = articleSoup.body.findAll(u'div', {u'class' : u'wikibase-snakview-value wikibase-snakview-variation-valuesnak'})
	for snak in descriptionSnakviews:
		descriptionLink = snak.find(u'a')
		try: 
			descriptionLinkString = (descriptionLink.string).lower()
			#we try to find the first suggestion by assuming its string won't have
			#the words 'disambiguation' and 'wikipedia' in it
			if u'disambiguation' not in descriptionLinkString and u'wikipedia' not in descriptionLinkString:
				#we extract the code
				hrefAttribute = re.compile('href="/wiki/|title="|"', re.UNICODE)
				firstDisambiguationCode = hrefAttribute.split((descriptionLink).encode('utf8'))[1]
				
				#we prepare the new pywikibot objects
				item = pywikibot.ItemPage(repository, firstDisambiguationCode)
				#we need to call the item (by using '.get()') to access the data
				item.get()

				#get the url
				wikidataUrl = item.full_url()
				#we try to transform the url into a URI-readable string if it contains an IRI code
				wikidataUrl = utils.iriToUri(wikidataUrl)
				
				#prepare a beautiful soup
				articleObject = urllib2.urlopen(wikidataUrl)
				articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
				
				#we define the variable description again
				try:
					descriptions = item.descriptions
					langDescription = descriptions[lang]
				except KeyError:
					langDescription = None
				break
			else:
				langDescription = None
		#if we don't find the description link, then we pass
		except AttributeError:
			pass
	return langDescription, item
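
For illustration, the href/title regex split used above is meant to isolate the Wikidata Q-code from an anchor tag; a small self-contained sketch of that step on a hypothetical piece of markup:

import re

hrefAttribute = re.compile('href="/wiki/|title="|"', re.UNICODE)
sampleAnchor = '<a href="/wiki/Q42" title="Q42">Douglas Adams</a>'  #hypothetical markup
#the second element of the split is the Q-code, here 'Q42'
print(hrefAttribute.split(sampleAnchor)[1])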
Example #4
0
def getGoogleFirstPages(query, nbOfPages=10, includeWikipedia=True):
	"""
	Returns a list containing the text of the first Google search
	result pages.
	Minimum number of pages: 1
	Maximum number of pages: 10

	Tutorial: http://stackoverflow.com/questions/37754771/python-get-result-of-google-search

	IN ENGLISH ONLY
	For other languages the script must be changed at the lines:
		- result = service.cse().list(q=query, cx=my_cse_id, excludeTerms='wikipedia.org', lr='lang_en').execute()
		- searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))
		- searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s+-site%3Awikipedia.org&lr=lang_en' %(query.replace(u' ', u'+')))
		- searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))

	"""
	resultContentList = []

	#we try to transform the query into a URI-readable string if it contains an IRI code
	query = utils.iriToUri(query)

	#google api information
	my_cse_id = u'010977947046578922081:vl9apgc5fic'
	api_key = open('apiKeyKnowledgeGraphAndCustomSearch.api_key').read()
	service = build('customsearch', 'v1', developerKey=api_key)
	#google search result (dict)
	try:
		if includeWikipedia != True:
			result = service.cse().list(q=query, cx=my_cse_id, excludeTerms='wikipedia.org', lr='lang_en').execute()
		else:
			result = service.cse().list(q=query, cx=my_cse_id, lr='lang_en').execute()
	
	#if we reach the 100/day limit of the google search api, we scrape the google search page using beautifulsoup
	except HttpError:
		#do we include wikipedia pages or not
		if includeWikipedia != True:
			searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s+-site%3Awikipedia.org&lr=lang_en' %(query.replace(u' ', u'+')))
		else:
			searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))

		#prepare a beautiful soup
		searchPageRequest = urllib2.Request(searchPage, headers={u'User-Agent' : u"Magic Browser"})
		searchPageObject = urllib2.urlopen(searchPageRequest)
		searchPageSoup = BeautifulSoup(searchPageObject.read(), 'lxml', from_encoding=searchPageObject.info().getparam(u'charset'))

		#get the page urls of the search results
		searchResultsWrapList = (searchPageSoup.body).findAll(u'h3', {u'class' : u'r'})
		#and we save them in a list
		result = []

		for h3Tag in searchResultsWrapList:
			googleLinkInfo = (h3Tag.find(u'a', {u'href':True})[u'href'])
			splittedLinkInfo = (re.compile(ur'=|&')).split(googleLinkInfo)
			if splittedLinkInfo[0] == u'/url?q':
				url = splittedLinkInfo[1]
				result.append(url)

	#getting the urls from the first google search results (using the google API)
	for indexResult in range(nbOfPages*2):
		try:
			#catching the url
			#if we used the google API
			if type(result) is not list:
				searchResult = result[u'items'][indexResult]
				googleSearchUrl = searchResult[u'link']
			#if we didn't use the API but beautifulsoup
			else:
				googleSearchUrl = result[indexResult]
			#we try to transform the url to an uri readeable string if it has an iri code in it
			googleSearchUrl = utils.iriToUri(googleSearchUrl)

			#scraping each search result's content
			#prepare a beautiful soup
			pageRequest = urllib2.Request(googleSearchUrl, headers={'User-Agent' : "Magic Browser"})
			pageObject = urllib2.urlopen(pageRequest)
			encoding = pageObject.info().getparam(u'charset')
			#if no encoding is specified, we assume it's probably utf-8
			if encoding in [None, u'None', 'None', 'none', '']:
				encoding = u'utf-8'
			pageSoup = BeautifulSoup(pageObject.read(), 'lxml', from_encoding=encoding)
			#getting the content out of the url pages
			pageText = pageSoup.body.text

			#we make sure the page is in english
			if langid.classify(pageText)[0] == u'en':

				#cleaning and filtering the text
				newPageText = ''				
				pageLinesList = pageText.split(u'\n')

				for indexLine in range(len(pageLinesList)):
					pageLine = pageLinesList[indexLine]
					lineChars = len(pageLine)
					#the input file has a limit of 4096 characters per line,
					#so we skip lines longer than 4090 characters
					if lineChars > 4090:
						pass
					#ollie seems to have a data limit;
					#we have discarded limits on the number of lines, number of characters, time and input.txt size,
					#so we limit which lines are taken into account (we only keep lines longer than 250 characters)
					elif lineChars < 250:
						pass
					elif u'{' in pageLine or u'}' in pageLine:
						pass
					elif pageLine in [u'', u' ']:
						pass
					else:
						newPageText = newPageText + pageLine.replace(u'\t', '') + u'\n'

				resultContentList.append(newPageText)

				#if we reach the desired number of pages we break the loop
				if len(resultContentList) >= nbOfPages:
					break
		#in case the desired number of pages exceeds the number of available results or is less than 1
		except IndexError:
			pass
		#if we cannot open the page using urllib, we pass
		except HTTPError:
			pass
		#if the page content or its body content is None we pass
		except AttributeError:
			pass
		#error: [Errno 104] Connection reset by peer or <urlopen error [Errno 110] Connection timed out>
		except Exception:
			pass

	#returning the list of text
	return resultContentList
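
A minimal usage sketch for getGoogleFirstPages (the query is illustrative, and the Google Custom Search API key file and module imports referenced above are assumed to exist):

pagesTextList = getGoogleFirstPages(u'Alan Turing biography', nbOfPages=3, includeWikipedia=False)
#each element is the filtered text of one search result page
for pageText in pagesTextList:
	print(len(pageText))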


Example #5
0
def getIntroWikipedia(namedEntity, returnAlist=True):
	'''
	Scrapes the Wikipedia page and extracts the intro of the article
	(so it can later be passed through the Open Information Extractors).
	'''
	paragraphIntroList = []
	wikidataUrl = None

	#get the article name by disambiguating
	articleName = namedEntity
	#articleName = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')

	if articleName != None:
		articleNameNoSpace = articleName.replace(u' ', u'_')
		#we try to transform the article name into a URI-readable string if it contains an IRI code
		articleNameNoSpace = utils.iriToUri(articleNameNoSpace)
	#if we don't find the right query we return None
	else:
		return None

	#get the article url
	articleUrl = 'https://en.wikipedia.org/wiki/%s' %(articleNameNoSpace)
	
	try:
		#prepare a beautiful soup
		articleObject = urllib2.urlopen(articleUrl.encode('utf8'))
	#if there is an HTTP error it means the page doesn't exist, so we return None
	except HTTPError:
		return None

	articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
	#get the first section-separated content
	articleContentDiv = articleSoup.body.find(u'div', {u'id' : u'toc'})

	#if there is no toc section (introduction), we search for the text appearing before the h2 tag (sub-title)
	if articleContentDiv == None:
		articleContentDiv = articleSoup.body.find(u'h2')

	#if the page is complete we extract the intro, otherwise we return None
	if articleContentDiv != None:
		#get the content of the previous paragraphs (aka the intro)
		articleIntroList = articleContentDiv.findAllPrevious(u'p')
		#cleaning the html intro list obtained
		wikiReference = re.compile(ur'\[[\d]*\]|\[.*needed\]|\[not verified.*\]|\[note.*\]|\(.*listen\)')
		for introParagraph in articleIntroList:
			introParagraphText = introParagraph.text
			#if it's a disambiguation page, we return None
			if len(articleIntroList) <= 2 and u'refer to:' in introParagraphText:
				return None
			#if it's not a disambiguation page
			else:
				if len(introParagraphText) != 0:
					introReferenceList = re.findall(wikiReference, introParagraphText)
					#if there are references
					if len(introReferenceList) != 0:
						for reference in introReferenceList:
							introParagraphText = introParagraphText.replace(reference, u'')
						cleanedParagraphText = introParagraphText
					#if there are no references
					else:
						cleanedParagraphText = introParagraphText
					#add to the list
					paragraphIntroList.append(cleanedParagraphText)
		#we scrape the wikidata url from wikipedia
		wikidataRow = articleSoup.body.find(u'a', {u'title' : u'Link to connected data repository item [g]'})
		if wikidataRow != None:
			wikidataUrl = wikidataRow.attrs[u'href']
		
		#if there is no wikidata link the page doesn't really exist, so we return None
		else:
			return None
	#if it's an incomplete page we return None
	else:
		return None

	#we put the list in the order it appears in wikipedia
	paragraphIntroList.reverse()
	#we return the intro, the article name and the wikidata url
	if returnAlist == True:
		return paragraphIntroList, articleNameNoSpace, wikidataUrl
	else:
		return u' '.join(paragraphIntroList), articleNameNoSpace, wikidataUrl
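
A short usage sketch, analogous to the one for getWikipediaPage (the input is illustrative and the same module-level imports are assumed):

intro = getIntroWikipedia(u'Alan Turing', returnAlist=False)
if intro is not None:
	introText, articleName, wikidataUrl = intro
	print(wikidataUrl)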
Example #6
0
def getGoogleKnowledgeGraph(query, queueKnowGraph=None):
    """
	Returns a dictionary containing the google
	knowledge graph information.
	#https://developers.google.com/apis-explorer/#p/kgsearch/v1/
	"""
    dictGoogleKnowGraph = {}
    api_key = open('apiKeyKnowledgeGraphAndCustomSearch.api_key').read()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

    #we try to make the query readable for the address bar
    query = utils.toUtf8(query)

    #transform any IRI code in the query into a URI-readable form
    query = utils.iriToUri(query)

    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }

    parameters = (urllib.urlencode(params))
    url = service_url + '?' + parameters
    response = json.loads(urllib.urlopen(url).read())

    try:
        #if we find an empty knowledge graph we pass, the dict will be empty and we will return None
        if len(response[u'itemListElement']) == 0:
            pass
        else:
            #possible entities in order of probability score
            bestResultElement = response[u'itemListElement'][0]
            elementContentDict = bestResultElement[u'result']
            #content of the most probable entity
            for elementContentIndex in range(len(elementContentDict)):
                elementContentKey = elementContentDict.keys(
                )[elementContentIndex]
                elementContent = elementContentDict[elementContentKey]

                #we discard all keys with the word image in them
                if u'image' not in elementContentKey:
                    #we treat the information differently depending on
                    #the type of content information
                    elementContentType = type(elementContent)

                    #if it's unicode we add directly to the dict
                    if elementContentType is unicode:
                        dictGoogleKnowGraph[u'%s000000.%s' %
                                            (str(elementContentIndex).zfill(3),
                                             elementContentKey.replace(
                                                 u'@', u''))] = elementContent
                    #if it's a list, every sub-element will be saved under the same key name
                    #(though the number differs)
                    elif elementContentType is list:
                        for subElementIndex, subElement in enumerate(
                                elementContent):

                            dictGoogleKnowGraph[
                                u'%s%s000.%s' %
                                (str(elementContentIndex).zfill(3),
                                 str(subElementIndex).zfill(3),
                                 elementContentKey.replace(u'@',
                                                           u''))] = subElement
                    #if it's a dict we save the sub-element corresponding to unicode or a list
                    elif elementContentType is dict:
                        for subElementIndex in range(len(elementContent)):
                            subElementKey = elementContent.keys(
                            )[subElementIndex]
                            subElement = elementContent[subElementKey]

                            subElementType = type(subElement)
                            if subElementType is unicode:
                                dictGoogleKnowGraph[
                                    u'%s000%s.%s.%s' %
                                    (str(elementContentIndex).zfill(3),
                                     str(subElementIndex).zfill(3),
                                     elementContentKey.replace(u'@', u''),
                                     subElementKey.replace(u'@',
                                                           u''))] = subElement
                            else:
                                raise TypeError('unexpected type of value',
                                                subElementType)
                    else:
                        raise TypeError('unexpected type of value',
                                        elementContentType)
        #complete the know graph dict the old-fashioned way: scraping
        dictGoogleKnowGraph = completeGoogleKnowledgeGraph(
            dictGoogleKnowGraph, query)

    #if the page we found is not a knowledge graph page we pass, the dict will be empty and we will return None
    except KeyError:
        pass

    #if we reach the 100/day limit of the google search api, we scrape the google search page using beautifulsoup
    except HttpError:
        #we try to fill the know graph dict the old-fashioned way: scraping
        dictGoogleKnowGraph = completeGoogleKnowledgeGraph(
            dictGoogleKnowGraph, query)

    #if the dict is empty we return None
    if len(dictGoogleKnowGraph) == 0:
        if queueKnowGraph != None:
            queueKnowGraph.put(None)
        return None

    #saving in queue and returning the dict
    if queueKnowGraph != None:
        queueKnowGraph.put(dictGoogleKnowGraph)
    return dictGoogleKnowGraph
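
A hedged usage sketch for getGoogleKnowledgeGraph (the query is illustrative, and the API key file read by the function is assumed to exist). Because the dict keys start with zero-padded element and sub-element indices, sorting them preserves the on-page order:

knowGraphDict = getGoogleKnowledgeGraph(u'Alan Turing')
if knowGraphDict is not None:
    for key in sorted(knowGraphDict):
        print(key)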


Example #7
0
def completeGoogleKnowledgeGraph(dictGoogleKnowGraph, query):
    '''
	We try to extract more information from the Google search page
	in order to complete the knowledge graph dict.
	'''
    searchPage = utils.iriToUri(
        u'https://www.google.ca/search?q=%s&lr=lang_en' %
        (query.replace(u' ', u'+')))

    #first we reformat all the dates already in the dict except for the ones in the description
    for keyKG in dictGoogleKnowGraph:
        #if it's not the entity description
        if u'articleBody' not in keyKG:
            dictGoogleKnowGraph[keyKG] = utils.reformatDates(
                dictGoogleKnowGraph[keyKG])

    #prepare a beautiful soup
    searchPageRequest = urllib2.Request(
        searchPage, headers={u'User-Agent': u"Magic Browser"})
    searchPageObject = urllib2.urlopen(searchPageRequest)
    searchPageSoup = BeautifulSoup(
        searchPageObject.read(),
        'lxml',
        from_encoding=searchPageObject.info().getparam(u'charset'))

    #get the knowledge graph infobox divs from the search page
    searchKnowGraphInfobox = (searchPageSoup.body).findAll(
        u'div', {u'class': u'_o0d'})
    #for each type of fact
    for indexDiv, div in enumerate(searchKnowGraphInfobox):
        divText = div.text
        #catching the name and type
        if indexDiv == 0:
            divChildren = div.findChildren()[1:]
            #we fill the dict if the name and/or type info is missing
            if len(divChildren) == 1 and divChildren[
                    0].text not in dictGoogleKnowGraph.values():
                dictGoogleKnowGraph[u'%s000000.%s' %
                                    (str(indexDiv).zfill(3),
                                     u'name')] = divChildren[0].text
            elif len(divChildren) == 2:
                if divChildren[0].text not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[u'000000000.%s' %
                                        (u'name')] = divChildren[0].text
                if divChildren[1].text not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[u'000000001.%s' %
                                        (u'type')] = divChildren[1].text
            else:
                pass
        #catching other infos
        else:
            knowGraphDateFormat = re.compile(
                ur'((.+): (January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [\d]?[\d], [\d]*)|((Born:|Died:) [\d]+)'
            )
            #catching the description
            if divText[-10:len(divText)] == u' Wikipedia':
                descriptionText = divText[:-10]
                if descriptionText not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[
                        u'%s000000.%s' %
                        (str(indexDiv).zfill(3),
                         u'detailedDescription.articleBody')] = descriptionText
            #catching and modifying the format of dates (except in the description)
            elif knowGraphDateFormat.match(divText) != None:
                divText = utils.reformatDates(divText)
                #we find the first colon so we can separate the relation from the fact
                firstColumn = divText.index(u':')
                relFact = divText[:firstColumn]
                #we save to the dict
                #if there is a comma after the date, we separate the two pieces of information
                if u', ' in divText:
                    firstComa = divText.index(u',')
                    if divText[(firstColumn +
                                2):] not in dictGoogleKnowGraph.values():
                        if divText[
                            (firstColumn +
                             2):firstComa] not in dictGoogleKnowGraph.values():
                            #date
                            dictGoogleKnowGraph[u'%s000000.%s' %
                                                (str(indexDiv).zfill(3),
                                                 relFact)] = divText[(
                                                     firstColumn +
                                                     2):firstComa]
                        if divText[(firstComa +
                                    2):] not in dictGoogleKnowGraph.values():
                            #place
                            dictGoogleKnowGraph[u'%s000001.%s' %
                                                (str(indexDiv).zfill(3),
                                                 relFact)] = divText[(
                                                     firstComa + 2):]
                else:
                    if divText[(firstColumn +
                                2):] not in dictGoogleKnowGraph.values():
                        #date
                        dictGoogleKnowGraph[u'%s000000.%s' %
                                            (str(indexDiv).zfill(3),
                                             relFact)] = divText[(firstColumn +
                                                                  2):]
            #catching the tables
            elif len(div.findAll(u'table')) != 0:
                tableName = div.find(u'div', {
                    u'class': re.compile(ur'.*')
                }).text
                #we catch the info in the table row by row, cell by cell
                tableData = []
                tableBody = div.find('tbody')
                rows = tableBody.findAll('tr')
                for row in rows:
                    cols = row.findAll('td')
                    cols = [ele.text.strip() for ele in cols]
                    tableData.append([ele for ele in cols
                                      if ele])  # Get rid of empty values
                #we save to the dict
                for indexData, data in enumerate(tableData):
                    #if it's a simple table (one column), otherwise it's too complex to add to the dict
                    if len(data) == 1:
                        dictGoogleKnowGraph[u'%s%s000.%s' %
                                            (str(indexDiv).zfill(3),
                                             str(indexData).zfill(3),
                                             tableName)] = data[0]
            #catching any other info
            elif len(divText) > 0 and u': ' in divText:
                divText = utils.reformatDates(divText)
                firstColumn = divText.index(u':')
                relFact = divText[:firstColumn]
                dictGoogleKnowGraph[u'%s000000.%s' %
                                    (str(indexDiv).zfill(3),
                                     relFact)] = divText[(firstColumn + 2):]
            #empty data
            else:
                #print(div)
                pass
    return dictGoogleKnowGraph
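
For illustration only, a hypothetical dict shaped like the output of the two knowledge graph functions above (keys follow the zero-padded 'indices.relation' scheme; the values are invented):

exampleKnowGraphDict = {
    u'000000000.name': u'Alan Turing',
    u'000000001.type': u'Computer scientist',
    u'002000000.Born': u'23 June 1912',
    u'003000000.detailedDescription.articleBody': u'Alan Mathison Turing was an English mathematician.',
}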
Example #8
0
def getInfoWikidata(namedEntity,
                    queueWikiData=None,
                    lang='en',
                    noDisambiguationSolving=True):
    '''
	Returns a dictionary containing all the Wikidata
	data in an easily human-readable way;
	otherwise: None.
	'''
    #get the right search query name by disambiguating
    searchQuery = namedEntity
    ######################################
    ###searchQuery = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')
    try:
        print(searchQuery)
    except UnicodeEncodeError:
        pass
    #prepare the pywikibot objects
    site = pywikibot.Site(lang, 'wikipedia')
    repository = site.data_repository()
    page = pywikibot.Page(site, searchQuery)

    try:
        item = pywikibot.ItemPage.fromPage(page)
        #we need to call the item (by using '.get()') to access the data
        item.get()

        #get the url
        wikidataUrl = item.full_url()
        #we try to transform the url into a URI-readable string if it contains an IRI code
        wikidataUrl = utils.iriToUri(wikidataUrl)

        #prepare a beautiful soup
        articleObject = urllib2.urlopen(wikidataUrl)
        articleSoup = BeautifulSoup(
            articleObject.read(),
            'lxml',
            from_encoding=articleObject.info().getparam(u'charset'))
        #we save the pywikibot objects in a list
        pywikibotObjList = [repository, item, articleSoup]

    #if we encounter a KeyError: u'upperBound' due to the ± character
    except KeyError:
        #defining beautiful soup objects
        pageUrl = page.full_url()
        #we try to transform the url into a URI-readable string if it contains an IRI code
        pageUrl = utils.iriToUri(pageUrl)

        articleObject = urllib2.urlopen(pageUrl)
        articleSoup = BeautifulSoup(
            articleObject.read(),
            'lxml',
            from_encoding=articleObject.info().getparam(u'charset'))
        #catching the wikidata item url
        itemUrl = articleSoup.find(
            u'a',
            {u'title': u'Link to connected data repository item [g]'})[u'href']
        #we save the pywikibot object (item url) in a list
        pywikibotObjList = [itemUrl]
    #if no page corresponds to the search
    except pywikibot.NoPage:
        print('ERROR no page', namedEntity)
        #saving in queue and returning the dict
        if queueWikiData != None:
            queueWikiData.put(None)
        #we return None
        return None
    #if the API takes too long or breaks, we return None
    except APIError:
        print('API ERROR maxlag', namedEntity)
        #saving in queue and returning the dict
        if queueWikiData != None:
            queueWikiData.put(None)
        #we return None
        return None
    #we get the information in dict form
    dictInfoWkdata = getInfoWikidataFromNet(namedEntity, queueWikiData, lang,
                                            pywikibotObjList,
                                            noDisambiguationSolving)

    #if the dict is empty we return None
    if len(dictInfoWkdata) == 0:
        if queueWikiData != None:
            queueWikiData.put(None)
        return None

    return dictInfoWkdata
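
A minimal usage sketch for getInfoWikidata (the named entity is illustrative; a configured pywikibot installation and the module-level imports are assumed):

dictInfoWkdata = getInfoWikidata(u'Douglas Adams')
if dictInfoWkdata is not None:
    for key in sorted(dictInfoWkdata):
        print(key)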
Example #9
0
def getInfoWkdataWithBtfulSoup(itemUrl,
                               dictInfoWkdata=None,
                               allInfo=True,
                               lang=u'en'):
    '''
	Scrapes a Wikidata page's info using beautiful soup and returns a dict.
	If we do not scrape all the info, then we return only the requested info, not a dict.
	If we want to scrape some specific information instead
	of all the page information, we need to specify it
	by replacing the allInfo argument with the type of
	information we're looking for:
	'l'	 'label'        :: the string label of the article
	'd'	 'description'  :: the string description of the article
	'a'	 'aliases'      :: the list of aliases of the article
	'c'	 'claims'       :: the list of claims of the article
	'''
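    #illustrative (hypothetical) calls, assuming a valid Wikidata item url:
    #  getInfoWkdataWithBtfulSoup(u'https://www.wikidata.org/wiki/Q42')                -> full info dict
    #  getInfoWkdataWithBtfulSoup(u'https://www.wikidata.org/wiki/Q42', allInfo=u'l')  -> label string only
    #  getInfoWkdataWithBtfulSoup(u'https://www.wikidata.org/wiki/Q42', allInfo=u'a')  -> list of aliases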
    #avoid sharing a mutable default argument between calls
    if dictInfoWkdata is None:
        dictInfoWkdata = {}

    #we try to transform the url into a URI-readable string if it contains an IRI code
    itemUrl = utils.iriToUri(itemUrl)

    articleObject = urllib2.urlopen(itemUrl)
    articleSoup = BeautifulSoup(
        articleObject.read(),
        'lxml',
        from_encoding=articleObject.info().getparam(u'charset'))

    #information contained in tables
    articleTable = articleSoup.body.findAll('table')
    for tableContent in articleTable:
        #LABEL
        articleLabel = (tableContent.find(
            u'span', {u'class': u'wikibase-labelview-text'})).string
        dictInfoWkdata[u'000000000.Label'] = articleLabel
        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'label', u'lab', u'l']:
                return articleLabel

        #DESCRIPTION
        articleDescription = (tableContent.find(
            u'span', {u'class': u'wikibase-descriptionview-text'})).string
        dictInfoWkdata[u'001000000.Description'] = articleDescription
        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'description', u'descrip', u'd']:
                return articleDescription

        #ALIAS
        articleAliasesList = tableContent.findAll(
            u'li', {u'class': u'wikibase-aliasesview-list-item'})
        aliasesList = []

        #saving each alias
        for indexL in range(len(articleAliasesList)):
            articleAlias = (articleAliasesList[indexL]).string
            dictInfoWkdata[u'002%s000.Aliases' %
                           (str(indexL).zfill(3))] = articleAlias

            aliasesList.append(articleAlias)

        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'aliases', u'alias', u'a']:
                return aliasesList

    #information otherwise located
    #CLAIMS
    articleClaims = articleSoup.body.findAll(
        u'div', {u'class': u'wikibase-statementgrouplistview'})
    claimList = []
    oldKeyName = u''

    for indexXl, claimSection in enumerate(articleClaims):
        claimTypeList = claimSection.findAll(
            u'div', {u'class': u'wikibase-statementgroupview'})
        #information divided by type
        for indexL, claimType in enumerate(claimTypeList):
            #capturing the label
            claimLabel = claimType.find(
                u'div',
                {u'class': u'wikibase-statementgroupview-property-label'})
            for labelContainer in claimLabel:
                keyName = (labelContainer.string)
            #capturing the information
            claimContent = claimType.findAll(
                u'div', {u'class': u'wikibase-snakview-value-container'})

            langKeyNameList = []
            keyNameList = []

            for indexM, contentContainer in enumerate(claimContent):
                contentValues = contentContainer.findAll(
                    u'div', {
                        u'class':
                        u'wikibase-snakview-value wikibase-snakview-variation-valuesnak'
                    })
                #capturing each row of information for each type of information
                for contentRow in contentValues:
                    #if the row contains information divided by language, we only select the info in the language of interest
                    if (contentRow.find(u'span', {u'lang': lang})) != None:
                        contentStringsList = contentRow.strings
                        langKeyNameList.append(keyName)
                    #if the row's content is in a language other than the one we want, we skip it
                    elif (contentRow.find(
                            u'span',
                        {u'lang': re.compile(ur'[a-z]*')})) != None:
                        contentStringsList = None
                        langKeyNameList.append(keyName)
                    #if the info is an appendix of a language-separated info
                    elif keyName in langKeyNameList:
                        pass
                    #if the info is new and non language-divided
                    else:
                        contentStringsList = contentRow.strings

                    #if we have an info to add to the dict, we add it
                    if contentStringsList != None:
                        value = u''
                        for cntntString in contentStringsList:
                            value += unicode(cntntString)
                        #save in dict
                        keyInfo = u'%s%s%s.%s' % (
                            str(indexXl + 900).zfill(3), str(indexL).zfill(3),
                            str(indexM).zfill(3), keyName)
                        #if the key does not exist yet
                        if keyInfo not in dictInfoWkdata:
                            #if the value is not empty and the key + value is not already in the dict
                            if value != u'' and keyName not in keyNameList and value not in dictInfoWkdata.values(
                            ):
                                wordsRe = re.compile(ur'[\w]+', re.UNICODE)
                                valueList = wordsRe.findall(value.lower())
                                #avoiding wikipedia references and images
                                if u'wikipedia' not in valueList and u'jpg' not in valueList and u'gif' not in valueList:
                                    dictInfoWkdata[keyInfo] = value
                                    claimList.append(value)
                                    keyNameList.append(keyName)
                                else:
                                    pass