def getWikipediaPage(namedEntity, returnAlist=True):
    '''
    Scrapes the Wikipedia page of the named entity and catches the text of its paragraphs
    (so it can, later, be passed through the Open Information Extractors).
    '''
    paragraphTextList = []
    wikidataUrl = None
    #get the article name by disambiguating
    articleName = namedEntity
    #articleName = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')
    if articleName != None:
        articleNameNoSpace = articleName.replace(u' ', u'_')
        #we try to transform the article name to an uri readable string if it has an iri code in it
        articleNameNoSpace = utils.iriToUri(articleNameNoSpace)
    #if we don't find the right query we return None
    else:
        return None
    #get the article url
    articleUrl = 'https://en.wikipedia.org/wiki/%s' %(articleNameNoSpace)
    try:
        #prepare a beautiful soup
        articleObject = urllib2.urlopen(articleUrl.encode('utf8'))
    #if there is an http error it means the page has an entry but doesn't exist so we return None
    except HTTPError:
        return None
    articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
    #get the text inside all paragraphs
    paragraphList = articleSoup.body.findAll(u'p')
    #we only take the text from the paragraphs, not the tags
    for paragraph in paragraphList:
        paragraphText = paragraph.text
        #we return None if we get to a disambiguation page
        if u'may refer to:' in paragraphText:
            return None
        #we clean the text from all wikipedia references
        cleanedParagraphText = re.sub(ur'\[[\d]+\]', u'', paragraphText)
        paragraphTextList.append(cleanedParagraphText)
    #we scrape the wikidata url from wikipedia
    wikidataRow = articleSoup.body.find(u'a', {u'title' : u'Link to connected data repository item [g]'})
    if wikidataRow != None:
        wikidataUrl = wikidataRow.attrs[u'href']
    #we return the paragraphs, the article name and the wikidata url
    if returnAlist == True:
        return paragraphTextList, articleNameNoSpace, wikidataUrl
    else:
        return u' '.join(paragraphTextList), articleNameNoSpace, wikidataUrl
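#Usage sketch (not part of the original module, added for illustration): shows the expected call and
#return shape of getWikipediaPage(); the wrapper name and the sample entity are hypothetical.
def _exampleGetWikipediaPage():
    #any named entity string works here; u'Alan Turing' is just an arbitrary example
    result = getWikipediaPage(u'Alan Turing', returnAlist=True)
    #None means the page is missing, could not be resolved or is a disambiguation page
    if result != None:
        paragraphTextList, articleNameNoSpace, wikidataUrl = result
        #articleNameNoSpace is the url-ready article name, e.g. u'Alan_Turing'
        print(articleNameNoSpace)
        #paragraphTextList holds the cleaned paragraph texts, wikidataUrl the scraped wikidata href (or None)
        print(len(paragraphTextList))
    return result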
def getGoogleSuggestedQueries(namedEntity):
    '''
    If we get no result for the search, it searches for google's suggestions containing the NE
    and returns a suggested query and a list with the other suggestions.
    If there's still nothing it returns None
    '''
    #we transform the named entity to utf8
    namedEntity = utils.toUtf8(namedEntity)
    #we try to transform the named entity to an uri readable string if it has an iri code in it
    namedEntity = utils.iriToUri(namedEntity)
    #we try to make the address bar
    regexWords = re.compile(ur'[\w]+', re.UNICODE)
    listOfNeWords = regexWords.findall(namedEntity)
    googleSuggestionsUrl = u'http://suggestqueries.google.com/complete/search?output=toolbar&q='
    #we make a string corresponding to the 2nd half of the search bar
    for indexNeWord, neWord in enumerate(listOfNeWords):
        #we try to make the named entity word readable for the address bar
        neWord = utils.toUtf8(neWord)
        #if it's the first word of the query
        if indexNeWord == 0:
            googleSuggestionsUrl += neWord
        #if it's not the first word
        else:
            googleSuggestionsUrl += u'%20' + neWord
    suggestionsPage = urllib2.urlopen(googleSuggestionsUrl)
    suggestionSoup = BeautifulSoup(suggestionsPage.read(), 'lxml')
    #we scrape the data from the suggestion webpage
    tagList = suggestionSoup.findAll(u'suggestion')
    googleSuggestedQueriesList = []
    for tag in tagList:
        googleSuggestedQueriesList.append(tag[u'data'])
    if len(googleSuggestedQueriesList) != 0:
        return googleSuggestedQueriesList
    else:
        return None
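#Usage sketch (illustration only, not in the original file): the suggest endpoint returns XML whose
#<suggestion data="..."> attributes end up in the returned list; None means no suggestions were found.
def _exampleGetGoogleSuggestedQueries():
    #u'barack obam' is an arbitrary partial named entity, chosen only to trigger suggestions
    suggestions = getGoogleSuggestedQueries(u'barack obam')
    if suggestions != None:
        for suggestion in suggestions:
            print(suggestion)
    return suggestions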
def firstElementOfDisambiguationPage(articleSoup, repository, lang):
    '''
    When the article is a disambiguation page, we use a regex to extract
    the default first description of the selected language
    '''
    #default values in case no valid description link is found in the page
    langDescription = None
    item = None
    descriptionSnakviews = articleSoup.body.findAll(u'div', {u'class' : u'wikibase-snakview-value wikibase-snakview-variation-valuesnak'})
    for snak in descriptionSnakviews:
        descriptionLink = snak.find(u'a')
        try:
            descriptionLinkString = (descriptionLink.string).lower()
            #we try to find the first suggestion by deducing its string won't have
            #the words 'disambiguation' and 'wikipedia' in it
            if u'disambiguation' not in descriptionLinkString and u'wikipedia' not in descriptionLinkString:
                #we extract the code
                hrefAttribute = re.compile('href="/wiki/|title="|"', re.UNICODE)
                firstDisambiguationCode = hrefAttribute.split((descriptionLink).encode('utf8'))[1]
                #we prepare the new pywikibot objects
                item = pywikibot.ItemPage(repository, firstDisambiguationCode)
                #we need to call the item (by using '.get()') to access the data
                item.get()
                #get the url
                wikidataUrl = item.full_url()
                #we try to transform the url to an uri readable string if it has an iri code in it
                wikidataUrl = utils.iriToUri(wikidataUrl)
                #prepare a beautiful soup
                articleObject = urllib2.urlopen(wikidataUrl)
                articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
                #we define the variable description again
                try:
                    descriptions = item.descriptions
                    langDescription = descriptions[lang]
                except KeyError:
                    langDescription = None
                break
            else:
                langDescription = None
        #if we don't find the description link, then we pass
        except AttributeError:
            pass
    return langDescription, item
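#Minimal sketch (hypothetical, not in the original module) of how firstElementOfDisambiguationPage()
#is meant to be wired up, mirroring the setup done in getInfoWikidata() further below: it needs the
#BeautifulSoup of a wikidata disambiguation item page, the pywikibot data repository and a language code.
def _exampleFirstElementOfDisambiguationPage(disambiguationItemUrl):
    site = pywikibot.Site('en', 'wikipedia')
    repository = site.data_repository()
    articleObject = urllib2.urlopen(utils.iriToUri(disambiguationItemUrl))
    articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
    #returns the default english description of the first non-disambiguation candidate and its ItemPage
    langDescription, item = firstElementOfDisambiguationPage(articleSoup, repository, 'en')
    return langDescription, item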
def getGoogleFirstPages(query, nbOfPages=10, includeWikipedia=True):
    """
    Returns a list containing the text in the first google suggested pages.
    Minimum number of pages: 1
    Maximum number of pages: 10
    Tuto: http://stackoverflow.com/questions/37754771/python-get-result-of-google-search
    IN ENGLISH ONLY
    For other languages the script must be changed at the lines:
    - result = service.cse().list(q=query, cx=my_cse_id, excludeTerms='wikipedia.org', lr='lang_en').execute()
    - searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))
    - searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s+-site%3Awikipedia.org&lr=lang_en' %(query.replace(u' ', u'+')))
    - searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))
    """
    resultContentList = []
    #we try to transform the query to an uri readable string if it has an iri code in it
    query = utils.iriToUri(query)
    #google api information
    my_cse_id = u'010977947046578922081:vl9apgc5fic'
    api_key = open('apiKeyKnowledgeGraphAndCustomSearch.api_key').read()
    service = build('customsearch', 'v1', developerKey=api_key)
    #google search result (dict)
    try:
        if includeWikipedia != True:
            result = service.cse().list(q=query, cx=my_cse_id, excludeTerms='wikipedia.org', lr='lang_en').execute()
        else:
            result = service.cse().list(q=query, cx=my_cse_id, lr='lang_en').execute()
    #if we reach the 100/day limit of the google search api, we scrape the google search page using beautifulsoup
    except HttpError:
        #do we include wikipedia pages or not
        if includeWikipedia != True:
            searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s+-site%3Awikipedia.org&lr=lang_en' %(query.replace(u' ', u'+')))
        else:
            searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))
        #prepare a beautiful soup
        searchPageRequest = urllib2.Request(searchPage, headers={u'User-Agent' : u"Magic Browser"})
        searchPageObject = urllib2.urlopen(searchPageRequest)
        searchPageSoup = BeautifulSoup(searchPageObject.read(), 'lxml', from_encoding=searchPageObject.info().getparam(u'charset'))
        #get the pages urls of the search
        searchResultsWrapList = (searchPageSoup.body).findAll(u'h3', {u'class' : u'r'})
        #and we save them in a list
        result = []
        for h3Tag in searchResultsWrapList:
            googleLinkInfo = (h3Tag.find(u'a', {u'href':True})[u'href'])
            splittedLinkInfo = (re.compile(ur'=|&')).split(googleLinkInfo)
            if splittedLinkInfo[0] == u'/url?q':
                url = splittedLinkInfo[1]
                result.append(url)
    #getting the urls from the first google search results (using the google API)
    for indexResult in range(nbOfPages*2):
        try:
            #catching the url
            #if we used the google API
            if type(result) is not list:
                searchResult = result[u'items'][indexResult]
                googleSearchUrl = searchResult[u'link']
            #if we didn't use the API but beautifulsoup
            else:
                googleSearchUrl = result[indexResult]
            #we try to transform the url to an uri readable string if it has an iri code in it
            googleSearchUrl = utils.iriToUri(googleSearchUrl)
            #scraping each search result content
            #prepare a beautiful soup
            pageRequest = urllib2.Request(googleSearchUrl, headers={'User-Agent' : "Magic Browser"})
            pageObject = urllib2.urlopen(pageRequest)
            encoding = pageObject.info().getparam(u'charset')
            #if there is no encoding specified, we suppose it's probably utf-8
            if encoding in [None, u'None', 'None', 'none', '']:
                encoding = u'utf-8'
            pageSoup = BeautifulSoup(pageObject.read(), 'lxml', from_encoding=encoding)
            #getting the content out of the url pages
            pageText = pageSoup.body.text
            #we make sure the page is in english
            if langid.classify(pageText)[0] == u'en':
                #cleaning and filtering the text
                newPageText = ''
                pageLinesList = pageText.split(u'\n')
                for indexLine in range(len(pageLinesList)):
                    pageLine = pageLinesList[indexLine]
                    lineChars = len(pageLine)
                    #the input file has a limit of characters per line of 4096 chars
                    #so we don't take into account the lines that are bigger than 4090
                    if lineChars > 4090:
                        pass
                    #ollie seems to have a data limit,
                    #we have discarded nb of line limit, nb of char limit, time limit, data weight of input.txt limit,
                    #so we have limited the size of lines taken into account (we only keep the ones bigger than 250 chars)
                    elif lineChars < 250:
                        pass
                    elif u'{' in pageLine or u'}' in pageLine:
                        pass
                    elif pageLine in [u'', u' ']:
                        pass
                    else:
                        newPageText = newPageText + pageLine.replace(u'\t', '') + u'\n'
                resultContentList.append(newPageText)
            #if we achieve the desired number of pages we break the loop
            if len(resultContentList) >= nbOfPages:
                break
        #in case the index exceeds the number of available results
        except IndexError:
            pass
        #if we cannot open the page using urllib, we pass
        except HTTPError:
            pass
        #if the page content or its body content is None we pass
        except AttributeError:
            pass
        #error: [Errno 104] Connection reset by peer or <urlopen error [Errno 110] Connection timed out>
        except Exception:
            pass
    #returning the list of text
    return resultContentList
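#Usage sketch (hypothetical, not in the original module): getGoogleFirstPages() returns up to nbOfPages
#strings of filtered page text, falling back to scraping the google results page when the Custom Search
#API quota is exhausted; the query below is an arbitrary example.
def _exampleGetGoogleFirstPages():
    pageTextList = getGoogleFirstPages(u'open information extraction', nbOfPages=3, includeWikipedia=False)
    for pageText in pageTextList:
        #each element is the cleaned text of one english result page (only lines of 250 to 4090 chars kept)
        print(len(pageText))
    return pageTextList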
def getIntroWikipedia(namedEntity, returnAlist=True):
    '''
    Scrapes Wikipedia's page and catches the intro of the article
    (so it can, later, be passed through the Open Information Extractors).
    '''
    paragraphIntroList = []
    wikidataUrl = None
    #get the article name by disambiguating
    articleName = namedEntity
    #articleName = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')
    if articleName != None:
        articleNameNoSpace = articleName.replace(u' ', u'_')
        #we try to transform the article name to an uri readable string if it has an iri code in it
        articleNameNoSpace = utils.iriToUri(articleNameNoSpace)
    #if we don't find the right query we return None
    else:
        return None
    #get the article url
    articleUrl = 'https://en.wikipedia.org/wiki/%s' %(articleNameNoSpace)
    try:
        #prepare a beautiful soup
        articleObject = urllib2.urlopen(articleUrl.encode('utf8'))
    #if there is an http error it means the page has an entry but doesn't exist so we return None
    except HTTPError:
        return None
    articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
    #get the first section-separated content
    articleContentDiv = articleSoup.body.find(u'div', {u'id' : u'toc'})
    #if there is no toc section (introduction), we search for the text appearing before the h2 tag (sub-title)
    if articleContentDiv == None:
        articleContentDiv = articleSoup.body.find(u'h2')
    #if it's an incomplete page we return None
    if articleContentDiv != None:
        #get the content of the previous paragraphs (aka the intro)
        articleIntroList = articleContentDiv.findAllPrevious(u'p')
        #cleaning the html intro list obtained
        wikiReference = re.compile(ur'\[[\d]*\]|\[.*needed\]|\[not verified.*\]|\[note.*\]|\(.*listen\)')
        for introParagraph in articleIntroList:
            introParagraphText = introParagraph.text
            #if it's a disambiguation page, we return None
            if len(articleIntroList) <= 2 and u'refer to:' in introParagraphText:
                return None
            #if it's not a disambiguation page
            else:
                if len(introParagraphText) != 0:
                    introReferenceList = re.findall(wikiReference, introParagraphText)
                    #if there are references
                    if len(introReferenceList) != 0:
                        for reference in introReferenceList:
                            introParagraphText = introParagraphText.replace(reference, u'')
                        cleanedParagraphText = introParagraphText
                    #if there are no references
                    else:
                        cleanedParagraphText = introParagraphText
                    #add to the list
                    paragraphIntroList.append(cleanedParagraphText)
        #we scrape the wikidata url from wikipedia
        wikidataRow = articleSoup.body.find(u'a', {u'title' : u'Link to connected data repository item [g]'})
        if wikidataRow != None:
            wikidataUrl = wikidataRow.attrs[u'href']
        #if the page doesn't exist we return None
        else:
            return None
    #if it's an incomplete page we return None
    else:
        return None
    #we put the list in the order it appears in wikipedia
    paragraphIntroList.reverse()
    #we return the intro, the article name and the wikidata url
    if returnAlist == True:
        return paragraphIntroList, articleNameNoSpace, wikidataUrl
    else:
        return u' '.join(paragraphIntroList), articleNameNoSpace, wikidataUrl
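#Usage sketch (illustration only): getIntroWikipedia() differs from getWikipediaPage() in that it only
#keeps the paragraphs appearing before the table of contents (or before the first h2 tag); the entity
#below is an arbitrary example.
def _exampleGetIntroWikipedia():
    #returnAlist=False joins the intro paragraphs into a single string
    result = getIntroWikipedia(u'Ada Lovelace', returnAlist=False)
    if result != None:
        introText, articleNameNoSpace, wikidataUrl = result
        print(introText[:100])
    return result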
def getGoogleKnowledgeGraph(query, queueKnowGraph=None):
    """
    Returns a dictionary containing the google knowledge graph information.
    #https://developers.google.com/apis-explorer/#p/kgsearch/v1/
    """
    dictGoogleKnowGraph = {}
    api_key = open('apiKeyKnowledgeGraphAndCustomSearch.api_key').read()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    #we try to make the query readable for the address bar
    query = utils.toUtf8(query)
    #transform all iri code in the query to uri readable
    query = utils.iriToUri(query)
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    parameters = (urllib.urlencode(params))
    url = service_url + '?' + parameters
    response = json.loads(urllib.urlopen(url).read())
    try:
        #if we find an empty knowledge graph we pass, the dict will be empty and we will return None
        if len(response[u'itemListElement']) == 0:
            pass
        else:
            #possible entities in order of probability score
            bestResultElement = response[u'itemListElement'][0]
            elementContentDict = bestResultElement[u'result']
            #content of the most probable entity
            for elementContentIndex in range(len(elementContentDict)):
                elementContentKey = elementContentDict.keys()[elementContentIndex]
                elementContent = elementContentDict[elementContentKey]
                #we discard all keys with the word image in them
                if u'image' not in elementContentKey:
                    #we treat the information differently depending on the type of content information
                    elementContentType = type(elementContent)
                    #if it's unicode we add it directly to the dict
                    if elementContentType is unicode:
                        dictGoogleKnowGraph[u'%s000000.%s' %(str(elementContentIndex).zfill(3), elementContentKey.replace(u'@', u''))] = elementContent
                    #if it's a list every sub-element will be saved under the same key name (though the number differs)
                    elif elementContentType is list:
                        for subElementIndex, subElement in enumerate(elementContent):
                            dictGoogleKnowGraph[u'%s%s000.%s' %(str(elementContentIndex).zfill(3), str(subElementIndex).zfill(3), elementContentKey.replace(u'@', u''))] = subElement
                    #if it's a dict we save the sub-element corresponding to unicode or a list
                    elif elementContentType is dict:
                        for subElementIndex in range(len(elementContent)):
                            subElementKey = elementContent.keys()[subElementIndex]
                            subElement = elementContent[subElementKey]
                            subElementType = type(subElement)
                            if subElementType is unicode:
                                dictGoogleKnowGraph[u'%s000%s.%s.%s' %(str(elementContentIndex).zfill(3), str(subElementIndex).zfill(3), elementContentKey.replace(u'@', u''), subElementKey.replace(u'@', u''))] = subElement
                            else:
                                raise TypeError('unexpected type of value', subElementType)
                    else:
                        raise TypeError('unexpected type of value', elementContentType)
            #complete the know graph dict the old-fashioned way: scraping
            dictGoogleKnowGraph = completeGoogleKnowledgeGraph(dictGoogleKnowGraph, query)
    #if the page we found is not a knowledge graph page we pass, the dict will be empty and we will return None
    except KeyError:
        pass
    #if we reach the 100/day limit of the google search api, we scrape the google search page using beautifulsoup
    except HttpError:
        #we try to fill the know graph dict the old-fashioned way: scraping
        dictGoogleKnowGraph = completeGoogleKnowledgeGraph(dictGoogleKnowGraph, query)
    #if the dict is empty we return None
    if len(dictGoogleKnowGraph) == 0:
        if queueKnowGraph != None:
            queueKnowGraph.put(None)
        return None
    #saving in queue and returning the dict
    if queueKnowGraph != None:
        queueKnowGraph.put(dictGoogleKnowGraph)
    return dictGoogleKnowGraph
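#Usage sketch (hypothetical, added for illustration): the queueKnowGraph argument suggests the function
#is meant to be runnable inside a separate process; the sketch below assumes the standard multiprocessing
#module and an arbitrary example query.
def _exampleGetGoogleKnowledgeGraphAsync():
    import multiprocessing
    queueKnowGraph = multiprocessing.Queue()
    process = multiprocessing.Process(target=getGoogleKnowledgeGraph, args=(u'Marie Curie', queueKnowGraph))
    process.start()
    #the queue receives either the knowledge graph dict or None
    dictGoogleKnowGraph = queueKnowGraph.get()
    process.join()
    return dictGoogleKnowGraph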
def completeGoogleKnowledgeGraph(dictGoogleKnowGraph, query):
    '''
    We try to extract more information from the google search page
    to try and complete the knowledge graph dict
    '''
    searchPage = utils.iriToUri(u'https://www.google.ca/search?q=%s&lr=lang_en' %(query.replace(u' ', u'+')))
    #first we reformat all the dates already in the dict except for the ones in the description
    for keyKG in dictGoogleKnowGraph:
        #if it's not the entity description
        if u'articleBody' not in keyKG:
            dictGoogleKnowGraph[keyKG] = utils.reformatDates(dictGoogleKnowGraph[keyKG])
    #prepare a beautiful soup
    searchPageRequest = urllib2.Request(searchPage, headers={u'User-Agent': u"Magic Browser"})
    searchPageObject = urllib2.urlopen(searchPageRequest)
    searchPageSoup = BeautifulSoup(searchPageObject.read(), 'lxml', from_encoding=searchPageObject.info().getparam(u'charset'))
    #get the knowledge graph infobox divs of the search page
    searchKnowGraphInfobox = (searchPageSoup.body).findAll(u'div', {u'class': u'_o0d'})
    #for each type of fact
    for indexDiv, div in enumerate(searchKnowGraphInfobox):
        divText = div.text
        #catching the name and type
        if indexDiv == 0:
            divChildren = div.findChildren()[1:]
            #we fill the dict if the name and/or type info is missing
            if len(divChildren) == 1 and divChildren[0].text not in dictGoogleKnowGraph.values():
                dictGoogleKnowGraph[u'%s000000.%s' %(str(indexDiv).zfill(3), u'name')] = divChildren[0].text
            elif len(divChildren) == 2:
                if divChildren[0].text not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[u'000000000.%s' %(u'name')] = divChildren[0].text
                if divChildren[1].text not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[u'000000001.%s' %(u'type')] = divChildren[1].text
            else:
                pass
        #catching other infos
        else:
            knowGraphDateFormat = re.compile(ur'((.+): (January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [\d]?[\d], [\d]*)|((Born:|Died:) [\d]+)')
            #catching the description
            if divText[-10:len(divText)] == u' Wikipedia':
                descriptionText = divText[:-10]
                if descriptionText not in dictGoogleKnowGraph.values():
                    dictGoogleKnowGraph[u'%s000000.%s' %(str(indexDiv).zfill(3), u'detailedDescription.articleBody')] = descriptionText
            #catching and modifying the format of dates (except in the description)
            elif knowGraphDateFormat.match(divText) != None:
                divText = utils.reformatDates(divText)
                #we find the first colon so we can separate the relation from the fact
                firstColumn = divText.index(u':')
                relFact = divText[:firstColumn]
                #we save to the dict
                #if there is a comma after the date, we separate the 2 pieces of information
                if u', ' in divText:
                    firstComa = divText.index(u',')
                    if divText[(firstColumn+2):] not in dictGoogleKnowGraph.values():
                        if divText[(firstColumn+2):firstComa] not in dictGoogleKnowGraph.values():
                            #date
                            dictGoogleKnowGraph[u'%s000000.%s' %(str(indexDiv).zfill(3), relFact)] = divText[(firstColumn+2):firstComa]
                        if divText[(firstComa+2):] not in dictGoogleKnowGraph.values():
                            #place
                            dictGoogleKnowGraph[u'%s000001.%s' %(str(indexDiv).zfill(3), relFact)] = divText[(firstComa+2):]
                else:
                    if divText[(firstColumn+2):] not in dictGoogleKnowGraph.values():
                        #date
                        dictGoogleKnowGraph[u'%s000000.%s' %(str(indexDiv).zfill(3), relFact)] = divText[(firstColumn+2):]
            #catching the tables
            elif len(div.findAll(u'table')) != 0:
                tableName = div.find(u'div', {u'class': re.compile(ur'.*')}).text
                #we catch the info in the table row by row, cell by cell
                tableData = []
                tableBody = div.find('tbody')
                rows = tableBody.findAll('tr')
                for row in rows:
                    cols = row.findAll('td')
                    cols = [ele.text.strip() for ele in cols]
                    #get rid of empty values
                    tableData.append([ele for ele in cols if ele])
                #we save to the dict
                for indexData, data in enumerate(tableData):
                    #if it's a simple table (one column), otherwise it's too complex to add to the dict
                    if len(data) == 1:
                        dictGoogleKnowGraph[u'%s%s000.%s' %(str(indexDiv).zfill(3), str(indexData).zfill(3), tableName)] = data[0]
            #catching any other info
            elif len(divText) > 0 and u': ' in divText:
                divText = utils.reformatDates(divText)
                firstColumn = divText.index(u':')
                relFact = divText[:firstColumn]
                dictGoogleKnowGraph[u'%s000000.%s' %(str(indexDiv).zfill(3), relFact)] = divText[(firstColumn+2):]
            #empty data
            else:
                #print(div)
                pass
    return dictGoogleKnowGraph
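#Usage sketch (illustration only): completeGoogleKnowledgeGraph() can be called with an empty dict to
#build the knowledge graph purely by scraping the google results page, which is also what
#getGoogleKnowledgeGraph() falls back to when the API quota is reached; the query is an arbitrary example.
def _exampleCompleteGoogleKnowledgeGraph():
    dictGoogleKnowGraph = completeGoogleKnowledgeGraph({}, u'Nikola Tesla')
    for keyKG in sorted(dictGoogleKnowGraph):
        #keys are zero-padded position codes followed by the relation name, e.g. u'000000000.name'
        print(keyKG)
    return dictGoogleKnowGraph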
def getInfoWikidata(namedEntity, queueWikiData=None, lang='en', noDisambiguationSolving=True):
    '''
    Returns a dictionary containing all the wikidata data in an easy human readable way, otherwise: None.
    '''
    #get the right search query name by disambiguating
    searchQuery = namedEntity
    ######################################
    ###searchQuery = getTheRightSearchQuery(namedEntity, wikiOrGoogleOriented='w')
    try:
        print(searchQuery)
    except UnicodeEncodeError:
        pass
    #prepare the pywikibot objects
    site = pywikibot.Site(lang, 'wikipedia')
    repository = site.data_repository()
    page = pywikibot.Page(site, searchQuery)
    try:
        item = pywikibot.ItemPage.fromPage(page)
        #we need to call the item (by using '.get()') to access the data
        item.get()
        #get the url
        wikidataUrl = item.full_url()
        #we try to transform the url to an uri readable string if it has an iri code in it
        wikidataUrl = utils.iriToUri(wikidataUrl)
        #prepare a beautiful soup
        articleObject = urllib2.urlopen(wikidataUrl)
        articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
        #we save the pywikibot objects in a list
        pywikibotObjList = [repository, item, articleSoup]
    #if we encounter a KeyError: u'upperBound' due to the ± character
    except KeyError:
        #defining beautiful soup objects
        pageUrl = page.full_url()
        #we try to transform the url to an uri readable string if it has an iri code in it
        pageUrl = utils.iriToUri(pageUrl)
        articleObject = urllib2.urlopen(pageUrl)
        articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
        #catching the page url
        itemUrl = articleSoup.find(u'a', {u'title': u'Link to connected data repository item [g]'})[u'href']
        #we save the pywikibot object (item url) in a list
        pywikibotObjList = [itemUrl]
    #if no page corresponds to the search
    except pywikibot.NoPage:
        print('ERROR no page', namedEntity)
        #saving in queue and returning
        if queueWikiData != None:
            queueWikiData.put(None)
        #we return None
        return None
    #if it takes too much time for the API or the API breaks we return None
    except APIError:
        print('API ERROR maxlag', namedEntity)
        #saving in queue and returning
        if queueWikiData != None:
            queueWikiData.put(None)
        #we return None
        return None
    #we get the information in dict form
    dictInfoWkdata = getInfoWikidataFromNet(namedEntity, queueWikiData, lang, pywikibotObjList, noDisambiguationSolving)
    #if the dict is empty we return None
    if len(dictInfoWkdata) == 0:
        if queueWikiData != None:
            queueWikiData.put(None)
        return None
    return dictInfoWkdata
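#Usage sketch (hypothetical): like getGoogleKnowledgeGraph(), getInfoWikidata() can be called directly
#or handed a multiprocessing queue through queueWikiData; the entity below is an arbitrary example.
def _exampleGetInfoWikidata():
    dictInfoWkdata = getInfoWikidata(u'Rosalind Franklin', queueWikiData=None, lang='en')
    #None means no wikidata page was found, the API failed or the resulting dict was empty
    if dictInfoWkdata != None:
        for keyWkdata in sorted(dictInfoWkdata):
            print(keyWkdata)
    return dictInfoWkdata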
def getInfoWkdataWithBtfulSoup(itemUrl, dictInfoWkdata={}, allInfo=True, lang=u'en'):
    '''
    Scrapes a wikidata site's info using beautiful soup and returns a dict.
    IF we do not scrape all the info, then we obtain the requested info, not a dict.
    If we want to scrape some specific information instead of all the page information,
    we need to specify it by replacing the allInfo argument with the type of information we're looking for:
        'l' 'label'       :: the string label of the article
        'd' 'description' :: the string description of the article
        'a' 'aliases'     :: the list of aliases of the article
        'c' 'claims'      :: the list of claims of the article
    '''
    #we try to transform the url to an uri readable string if it has an iri code in it
    itemUrl = utils.iriToUri(itemUrl)
    articleObject = urllib2.urlopen(itemUrl)
    articleSoup = BeautifulSoup(articleObject.read(), 'lxml', from_encoding=articleObject.info().getparam(u'charset'))
    #information contained in tables
    articleTable = articleSoup.body.findAll('table')
    for tableContent in articleTable:
        #LABEL
        articleLabel = (tableContent.find(u'span', {u'class': u'wikibase-labelview-text'})).string
        dictInfoWkdata[u'000000000.Label'] = articleLabel
        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'label', u'lab', u'l']:
                return articleLabel
        #DESCRIPTION
        articleDescription = (tableContent.find(u'span', {u'class': u'wikibase-descriptionview-text'})).string
        dictInfoWkdata[u'001000000.Description'] = articleDescription
        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'description', u'descrip', u'd']:
                return articleDescription
        #ALIAS
        articleAliasesList = tableContent.findAll(u'li', {u'class': u'wikibase-aliasesview-list-item'})
        aliasesList = []
        #saving each alias
        for indexL in range(len(articleAliasesList)):
            articleAlias = (articleAliasesList[indexL]).string
            dictInfoWkdata[u'002%s000.Aliases' %(str(indexL).zfill(3))] = articleAlias
            aliasesList.append(articleAlias)
        #specific info return
        if allInfo != True:
            if allInfo.lower() in [u'aliases', u'alias', u'a']:
                return aliasesList
    #information otherwise located
    #CLAIMS
    articleClaims = articleSoup.body.findAll(u'div', {u'class': u'wikibase-statementgrouplistview'})
    claimList = []
    oldKeyName = u''
    for indexXl, claimSection in enumerate(articleClaims):
        claimTypeList = claimSection.findAll(u'div', {u'class': u'wikibase-statementgroupview'})
        #information divided by type
        for indexL, claimType in enumerate(claimTypeList):
            #capturing the label
            claimLabel = claimType.find(u'div', {u'class': u'wikibase-statementgroupview-property-label'})
            for labelContainer in claimLabel:
                keyName = (labelContainer.string)
            #capturing the information
            claimContent = claimType.findAll(u'div', {u'class': u'wikibase-snakview-value-container'})
            langKeyNameList = []
            keyNameList = []
            for indexM, contentContainer in enumerate(claimContent):
                contentValues = contentContainer.findAll(u'div', {u'class': u'wikibase-snakview-value wikibase-snakview-variation-valuesnak'})
                #capturing each row of information for each type of information
                for contentRow in contentValues:
                    #if the row contains information divided by language, we only select the info corresponding to the concerned language
                    if (contentRow.find(u'span', {u'lang': lang})) != None:
                        contentStringsList = contentRow.strings
                        langKeyNameList.append(keyName)
                    #if the row is language-divided but not in the language we want, we pass
                    elif (contentRow.find(u'span', {u'lang': re.compile(ur'[a-z]*')})) != None:
                        contentStringsList = None
                        langKeyNameList.append(keyName)
                    #if the info is an appendix of a language-separated info
                    elif keyName in langKeyNameList:
                        pass
                    #if the info is new and non language-divided
                    else:
                        contentStringsList = contentRow.strings
                    #if we have an info to add to the dict, we add it
                    if contentStringsList != None:
                        value = u''
                        for cntntString in contentStringsList:
                            value += unicode(cntntString)
                        #save in dict
                        keyInfo = u'%s%s%s.%s' %(str(indexXl+900).zfill(3), str(indexL).zfill(3), str(indexM).zfill(3), keyName)
                        #if the key does not exist yet
                        if keyInfo not in dictInfoWkdata:
                            #if the value is not empty and the key + value is not already in the dict
                            if value != u'' and keyName not in keyNameList and value not in dictInfoWkdata.values():
                                wordsRe = re.compile(ur'[\w]+', re.UNICODE)
                                valueList = wordsRe.findall(value.lower())
                                #avoiding wikipedia references and images
                                if u'wikipedia' not in valueList and u'jpg' not in valueList and u'gif' not in valueList:
                                    dictInfoWkdata[keyInfo] = value
                                    claimList.append(value)
                                    keyNameList.append(keyName)
                        else:
                            pass
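#Usage sketch (illustration only): with the allInfo flag set to one of the shortcuts documented in the
#docstring ('l', 'd' or 'a'), getInfoWkdataWithBtfulSoup() returns that single piece of information
#instead of a dict; the wikidata item url below is an arbitrary example.
def _exampleGetInfoWkdataWithBtfulSoup():
    itemUrl = u'https://www.wikidata.org/wiki/Q7186'
    articleDescription = getInfoWkdataWithBtfulSoup(itemUrl, dictInfoWkdata={}, allInfo=u'd')
    print(articleDescription)
    return articleDescription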