def classFinder(url, articleEncodedTitle, desiredCategory):
    """Look up the quality/importance class of a Wikipedia article.

    Parameters:
        url: full URL of the article page (used only for namespace filtering).
        articleEncodedTitle: URL-encoded article title, used to build the
            talk-page URL and the MediaWiki API query.
        desiredCategory: category name to search for on the talk page.

    Returns:
        (qualityClass, importanceClass, desiredCategoryFound) tuple.

    NOTE(review): this copy of classFinder appears truncated in the original
    source and a complete later definition with the same name supersedes it
    at runtime; the trailing return below restores the 3-tuple contract.
    """
    qualityClass = "No-Class"
    importanceClass = "No-Class"
    desiredCategoryFound = False
    print("Finding class for url: " + url)
    # Non-article namespaces are not classified.
    # NOTE(review): this namespace test was redacted in the source; the
    # reconstruction mirrors the sibling IsWikipageAppropriate checks.
    if ("Category:" in url or "User:" in url or "Talk:" in url
            or "User_talk:" in url or "Book:" in url):
        return "No-Class", "No-Class", False
    # Find the talk page hyperlink.
    talkPageHyperlink = 'http://en.wikipedia.org/wiki/Talk:' + articleEncodedTitle
    # Convert the hyperlink to absolute if it is relative.
    # talkPageHyperlink = urljoin(url, talkPageHyperlink)
    # Retrieve url content, and convert it into the beautifulsoup structure.
    talkSoup = soupStructure(talkPageHyperlink)
    errorCounter = 0
    # Retry (at most 11 attempts) until the talk-page soup has either the
    # category-links block or the "page does not exist" marker.
    while (talkSoup == ''
           or (talkSoup.find('div', id="catlinks") is None
               and talkSoup.find('b', text="Wikipedia does not have a") is None)) \
            and errorCounter <= 10:
        talkSoup = soupStructure(talkPageHyperlink)
        errorCounter += 1
    # Fetch the latest revision info from the MediaWiki API; retry on failure.
    while True:
        try:
            r = requests.get(
                'https://en.wikipedia.org/w/api.php?action=query&format=json'
                '&prop=revisions&titles=' + articleEncodedTitle)
            r = r.json()
            query = r['query']
            pages = query['pages']
            break
        except Exception as e:  # was Python-2 `except Exception, e`
            print("\n\n\nI cannot retrieve the revision ID of this article! "
                  "Error Message: " + str(e))
            time.sleep(1)
    # NOTE(review): the original text ends here without using `pages`; return
    # the defaults gathered so far so callers can still unpack three values.
    return qualityClass, importanceClass, desiredCategoryFound
def WikipediaStatsGenerator(recommendationsFile):
    """Generate a statistics CSV for every Wikipedia page listed in a
    recommendations CSV.

    Reads `<recommendationsFile>.csv` (title/URL pairs in repeating triples of
    columns starting at column 2) and writes `<recommendationsFile>_Stats.csv`
    with one stats row per distinct page, as produced by WikipediaPageStats.

    Parameters:
        recommendationsFile: path prefix of the CSV files (without ".csv").
    """
    previousURLs = set()  # set instead of list: O(1) duplicate-URL lookups
    # Text mode with newline='' is the documented way to use the csv module
    # on Python 3 (the original used the Python-2 'rb'/'wb' binary modes).
    with open(recommendationsFile + '.csv', 'r', newline='') as fr:
        reader = csv.reader(fr)
        with open(recommendationsFile + '_Stats.csv', 'w', newline='') as fw:
            writer = csv.writer(fw)
            writer.writerow([
                'ID', 'Title', 'Edit Protection Level', 'Class', 'Importance',
                'Page Length', '# watchers', 'Time of Last Edit',
                '# redirects to this page', 'Page Creation Date',
                'Total # edits', 'Recent # edits (within past 30 days)',
                'Recent # distinct authors', '# views (last 90 days)',
                'Total # references', '# references published after 2010',
                '# External Hyperlinks'
            ])
            next(reader)  # skip the input file's header row
            for row in reader:
                # Title/URL pairs repeat every 3 columns from column 2 on.
                for i in range(2, len(row) - 1, 3):
                    if row[i] == "" or row[i + 1] == "":
                        continue
                    wikipageTitle = row[i]
                    print("Wikipage Title:", wikipageTitle)
                    wikipageURL = row[i + 1]
                    print("Wikipage URL:", wikipageURL)
                    wikipediaSoup = soupStructure(wikipageURL)
                    print("Soup is Retrieved.")
                    if wikipediaSoup != "" and wikipageURL not in previousURLs:
                        pubResultRow = WikipediaPageStats(
                            wikipageURL, wikipediaSoup, wikipageTitle, 'Econ')
                        # Retry a few times when the stats lookup comes back empty.
                        trialNum = 0
                        while pubResultRow == [] and trialNum < 10:
                            trialNum += 1
                            pubResultRow = WikipediaPageStats(
                                wikipageURL, wikipediaSoup, wikipageTitle, 'Econ')
                        print(pubResultRow)
                        writer.writerow(pubResultRow)
                        previousURLs.add(wikipageURL)
def classFinder(url, articleEncodedTitle, desiredCategory):
    """Determine a Wikipedia article's quality class, importance class, and
    whether its talk page mentions a desired category.

    Quality is predicted from the ORES wp10 model for the article's latest
    revision (the per-class probabilities are collapsed into a weighted
    average and rebinned); importance is scraped from the talk page's
    category-links block.

    Parameters:
        url: full URL of the article page (used only for namespace filtering).
        articleEncodedTitle: URL-encoded article title.
        desiredCategory: category name searched for, case-insensitively, in
            the talk page's category block.

    Returns:
        (qualityClass, importanceClass, desiredCategoryFound).
    """
    qualityClass = "No-Class"
    importanceClass = "No-Class"
    desiredCategoryFound = False
    print("Finding class for url: " + url)
    # Non-article namespaces are not classified.
    # NOTE(review): this namespace test was redacted in the source; the
    # reconstruction mirrors the sibling IsWikipageAppropriate checks.
    if ("Category:" in url or "User:" in url or "Talk:" in url
            or "User_talk:" in url or "Book:" in url):
        return "No-Class", "No-Class", False
    talkPageHyperlink = 'http://en.wikipedia.org/wiki/Talk:' + articleEncodedTitle
    # Convert the hyperlink to absolute if it is relative.
    # talkPageHyperlink = urljoin(url, talkPageHyperlink)
    talkSoup = soupStructure(talkPageHyperlink)
    errorCounter = 0
    # Retry (at most 11 attempts) until the talk-page soup has either the
    # category-links block or the "page does not exist" marker.
    while (talkSoup == ''
           or (talkSoup.find('div', id="catlinks") is None
               and talkSoup.find('b', text="Wikipedia does not have a") is None)) \
            and errorCounter <= 10:
        talkSoup = soupStructure(talkPageHyperlink)
        errorCounter += 1
    # Fetch the latest revision info from the MediaWiki API; retry on failure.
    while True:
        try:
            r = requests.get(
                'https://en.wikipedia.org/w/api.php?action=query&format=json'
                '&prop=revisions&titles=' + articleEncodedTitle)
            r = r.json()
            query = r['query']
            pages = query['pages']
            break
        except Exception as e:  # was the broken `except (e):` in the original
            print("\n\n\nI cannot retrieve the revision ID of this article! "
                  "Error Message: " + str(e))
            time.sleep(1)
    try:
        # dict.values() is a view on Python 3; materialize before indexing
        # (the original's `pages.values()[0]` only worked on Python 2).
        revisions = list(pages.values())[0]['revisions']
        revisionID = revisions[0]['revid']
    except Exception:
        # Page has no retrievable revisions (e.g. missing page).
        return "No-Class", "No-Class", False
    qualityDict = None
    # Ask ORES for the wp10 quality prediction of this revision; retry on
    # transient API failures.
    while True:
        try:
            r = requests.get('https://ores.wmflabs.org/scores/enwiki/wp10/'
                             + str(revisionID))
            print("ores.wmflabs.org responded: " + str(r))
            r = r.json()
            qualityDict = r[str(revisionID)]
            # Newer API responses nest the payload under 'score'; unwrap a
            # few levels until a prediction shows up.
            innerIndex = 0
            while 'prediction' not in qualityDict and innerIndex < 4:
                qualityDict = qualityDict['score']
                innerIndex = innerIndex + 1
                print("Found score in qualityDict for the " + str(innerIndex)
                      + " time.")
            qualityClass = qualityDict['prediction']
            break
        except Exception as e:  # was the broken `except (e):` in the original
            print("\n\n\nI cannot retrieve the class of this article from the "
                  "API! Error Message: " + str(e))
            time.sleep(1)
    # Collapse the per-class probabilities into a weighted average
    # (Stub=1 ... FA=6) and rebin it into a class label.
    probabilitiesOfEachClass = qualityDict['probability']
    FAClassProbability = probabilitiesOfEachClass['FA']
    GAClassProbability = probabilitiesOfEachClass['GA']
    BClassProbability = probabilitiesOfEachClass['B']
    CClassProbability = probabilitiesOfEachClass['C']
    StartClassProbability = probabilitiesOfEachClass['Start']
    StubClassProbability = probabilitiesOfEachClass['Stub']
    weightedAverage = (
        float(StubClassProbability) + 2 * float(StartClassProbability) +
        3 * float(CClassProbability) + 4 * float(BClassProbability) +
        5 * float(GAClassProbability) + 6 * float(FAClassProbability)) / (
            float(StubClassProbability) + float(StartClassProbability) +
            float(CClassProbability) + float(BClassProbability) +
            float(GAClassProbability) + float(FAClassProbability))
    if weightedAverage > 5:
        qualityClass = "FA-Class"
    elif weightedAverage > 4:
        qualityClass = "GA-Class"
    elif weightedAverage > 3:
        qualityClass = "B-Class"
    elif weightedAverage > 2:
        qualityClass = "C-Class"
    elif weightedAverage > 1:
        qualityClass = "Start-Class"
    else:
        qualityClass = "Stub-Class"
    print("qualityClass: " + qualityClass)
    # Scrape the importance class from the talk page's category links.
    if (talkSoup != ''
            and talkSoup.find('div', id="catlinks") is not None
            and talkSoup.find('b', text="Wikipedia does not have a") is None):
        categoryDIVTag = talkSoup.find('div', id="catlinks")
        if categoryDIVTag.find(text=re.compile('.*Top-importance.*')) is not None:
            importanceClass = "Top-importance"
        elif categoryDIVTag.find(text=re.compile('.*High-importance.*')) is not None:
            importanceClass = "High-importance"
        elif categoryDIVTag.find(text=re.compile('.*Mid-importance.*')) is not None:
            importanceClass = "Mid-importance"  # bug fix: was "Mid-importancee"
        elif categoryDIVTag.find(text=re.compile('.*Low-importance.*')) is not None:
            importanceClass = "Low-importance"
        elif categoryDIVTag.find(text=re.compile('.*NA-importance.*')) is not None:
            importanceClass = "NA-importance"
        elif categoryDIVTag.find(text=re.compile('.*Unknown-importance.*')) is not None:
            importanceClass = "Unknown-importance"
        elif categoryDIVTag.find(text=re.compile('.*Bottom-importance.*')) is not None:
            importanceClass = "Bottom-importance"
        if desiredCategory.lower() in categoryDIVTag.prettify().lower():
            desiredCategoryFound = True
    return qualityClass, importanceClass, desiredCategoryFound
def IsWikipageAppropriate(title, hyperlink):
    """Decide whether a Wikipedia page is appropriate to recommend.

    A page is rejected when it is in a non-article namespace, cannot be
    fetched, yields no stats row, is edit-protected, is Stub-class, or has
    fewer than 1000 views.

    Parameters:
        title: page title (used for namespace filtering).
        hyperlink: page URL.

    Returns:
        (True, resultRow) when the page is recommendable, where resultRow is
        the stats mapping from WikipediaPageStats; (False, None) otherwise.
    """
    # NOTE(review): the namespace test was redacted in the source; the
    # reconstruction mirrors the other checks in this function family.
    if ("Category:" in title or "User:" in title or "Talk:" in title
            or "User talk:" in title or "Book:" in title):
        return False, None
    wikipediaSoup = soupStructure(hyperlink)
    print("Wikipedia page Soup is Retrieved.")
    trialNum = 0
    # Retry the fetch up to 10 times.
    while wikipediaSoup == "" and trialNum < 10:
        trialNum += 1
        print("Wikipedia page:" + hyperlink +
              " Soup is not retreived. Please enter an appropriate URL:")
        # hyperlink = input()
        # if hyperlink == "1":
        #     return False, None
        wikipediaSoup = soupStructure(hyperlink)
    if wikipediaSoup == "":
        # Bug fix: the original had no explicit return on the unfetchable
        # path, so callers unpacking two values could fail.
        return False, None
    resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title, 'list')
    trialNum = 0
    # Retry the stats extraction up to 10 times, re-fetching the soup as needed.
    while resultRow == [] and trialNum < 10:
        trialNum += 1
        print("wikipageURL:", hyperlink, "is not found. Please enter a new one:")
        wikipediaSoup = soupStructure(hyperlink)
        innerTrialNum = 0
        while wikipediaSoup == "" and innerTrialNum < 10:
            innerTrialNum += 1
            print("Wikipedia page:" + hyperlink +
                  " Soup is not retreived. Please enter an appropriate URL:")
            wikipediaSoup = soupStructure(hyperlink)
        resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title, 'list')
    if resultRow == []:
        return False, None
    print("Wikipedia page Stats:", resultRow)
    # If the edit protection of the page is not None:
    if resultRow["editProtectionLevel"].lower() != "none":
        print("The Wikipedia page is edit protected. Do not recommend it.")
        return False, None
    if resultRow["qualityClass"].lower() == "stub-class":
        print("The Wikipedia page is a Stub. Do not recommend it.")
        return False, None
    if num(resultRow["viewsNum"]) < 1000:
        print("The Wikipedia page has been viewed less than 1000 times. Do not recommend it.")
        return False, None
    print("The Wikipedia page is OK to recommend.")
    return True, resultRow
def IsWikipageAppropriate(title, hyperlink):
    """Decide whether a Wikipedia page is appropriate to recommend.

    Stricter variant (it supersedes the earlier definition of the same name):
    rejects non-article namespaces (including Template:), unfetchable pages,
    Stub/Start quality, any importance other than High/Top, and pages whose
    reported length is under 1000.

    Parameters:
        title: page title (used for namespace filtering).
        hyperlink: page URL.

    Returns:
        (True, resultRow) when the page is recommendable, where resultRow is
        the stats mapping from WikipediaPageStats; (False, None) otherwise.
    """
    # NOTE(review): the namespace test was redacted in the source; the
    # reconstruction mirrors the other checks in this function family.
    if ("Category:" in title or "User:" in title or "Talk:" in title
            or "User talk:" in title or "Book:" in title
            or "Template:" in title):
        return False, None
    wikipediaSoup = soupStructure(hyperlink)
    print("Wikipedia page Soup is Retrieved.")
    trialNum = 0
    # Retry the fetch up to 10 times.
    while wikipediaSoup == "" and trialNum < 10:
        trialNum += 1
        print("Wikipedia page:" + hyperlink +
              " Soup is not retreived. Please enter an appropriate URL:")
        # hyperlink = input()
        # if hyperlink == "1":
        #     return False, None
        wikipediaSoup = soupStructure(hyperlink)
    if wikipediaSoup == "":
        # Bug fix: the original had no explicit return on the unfetchable
        # path, so callers unpacking two values could fail.
        return False, None
    resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title, 'list')
    trialNum = 0
    # Retry the stats extraction up to 10 times, re-fetching the soup as needed.
    while resultRow == [] and trialNum < 10:
        trialNum += 1
        print("wikipageURL:", hyperlink, "is not found. Please enter a new one:")
        wikipediaSoup = soupStructure(hyperlink)
        innerTrialNum = 0
        while wikipediaSoup == "" and innerTrialNum < 10:
            innerTrialNum += 1
            print("Wikipedia page:" + hyperlink +
                  " Soup is not retreived. Please enter an appropriate URL:")
            wikipediaSoup = soupStructure(hyperlink)
        resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title, 'list')
    if resultRow == []:
        return False, None
    print("Wikipedia page Stats:", resultRow)
    # Edit-protection check intentionally disabled in this variant.
    # if resultRow[2].lower() != "none":
    #     print("The Wikipedia page is edit protected. Do not recommend it.")
    #     return False, None
    if (resultRow['qualityClass'].lower() == "stub-class"
            or resultRow['qualityClass'].lower() == "start"):
        print("The Wikipedia page is a " + resultRow['qualityClass'] +
              ". Do not recommend it.")
        return False, None
    if (resultRow['importanceClass'].lower() != 'high-importance'
            and resultRow['importanceClass'].lower() != 'top-importance'):
        print("The Wikipedia page is a " + resultRow['importanceClass'] +
              ". Do not recommend it.")
        return False, None
    if num(resultRow['length']) < 1000:
        # Bug fix: the original message claimed a view-count check, but the
        # condition tests the page length.
        print("The Wikipedia page is shorter than 1000. Do not recommend it.")
        return False, None
    print("The Wikipedia page is OK to recommend.")
    return True, resultRow
def econPapersCategoriesExtractor(ideasURL, startSpecialization, endSpecialization):
    """Walk the IDEAS/RePEc specialization list and extract author data.

    Creates 'Ideas_Repec_Dataset.csv' and 'Ideas_Repec_Affiliations.csv',
    writes their header rows, then iterates the specialization links on
    `ideasURL` from `startSpecialization` up to and including
    `endSpecialization`, delegating each to econPapersAuthorListExtractor.

    Parameters:
        ideasURL: URL of the IDEAS page listing all specializations.
        startSpecialization: label at which extraction begins (inclusive).
        endSpecialization: label at which extraction stops (inclusive).
    """
    # Text mode with newline='' is the documented way to use the csv module
    # on Python 3 (the original used the Python-2 'wb' binary mode).
    with open('Ideas_Repec_Dataset.csv', 'w', newline='') as fw:
        writer = csv.writer(fw)
        with open('Ideas_Repec_Affiliations.csv', 'w', newline='') as fwAffiliations:
            writerAffiliations = csv.writer(fwAffiliations)
            # Main dataset header: author fields + 7 publication quadruples.
            headerRow = [
                'firstName', 'lastName', 'email', 'specialization',
                'EconPapers Profile', 'affiliation', 'location', 'homepage'
            ]
            for pubIdx in range(1, 8):
                headerRow += [
                    'publication%d' % pubIdx, 'publicationYear%d' % pubIdx,
                    'citation%d' % pubIdx, 'firstKeyword%d' % pubIdx
                ]
            writer.writerow(headerRow)
            # Affiliations header: author fields + 7 affiliation/location pairs.
            affiliationsHeader = ['firstName', 'lastName', 'email']
            for affIdx in range(1, 8):
                affiliationsHeader += [
                    'affiliation%d' % affIdx, 'location%d' % affIdx
                ]
            writerAffiliations.writerow(affiliationsHeader)
            ideasSoup = soupStructure(ideasURL)
            # If the content of the page is returned in BeautifulSoup structure:
            if ideasSoup != '':
                # Anchor on a known specialization label to locate the main list.
                mainList = ideasSoup.body.find(
                    text=re.compile('.*Accounting & Auditing.*')).parent.parent
                if mainList is not None:
                    startSpecializationObserved = False
                    for liTag in mainList.findAll('li'):
                        aTag = liTag.find('a')
                        # The specialization label follows the link tag
                        # (skip its leading separator character).
                        specialization = aTag.nextSibling[1:]
                        print(str(specialization))
                        if specialization == startSpecialization:
                            startSpecializationObserved = True
                        if (specialization != "All new papers"
                                and specialization != "German Papers"
                                and startSpecializationObserved):
                            econPapersAuthorListExtractor(
                                writer, writerAffiliations, specialization,
                                'https://ideas.repec.org' + aTag['href'])
                        if specialization == endSpecialization:
                            return
def econPapersAuthorListExtractor(writer, writerAffiliations, specialization, ideasURL):
    """Extract every author profile linked from an IDEAS specialization page
    and append one row per author to the two CSV writers.

    Parameters:
        writer: csv writer for the main dataset rows.
        writerAffiliations: csv writer for the affiliation rows.
        specialization: specialization label passed through to the profile
            extractor.
        ideasURL: URL of the specialization's author-list page.

    Returns:
        False (kept for compatibility with the original interface).
    """
    ideasSoup = soupStructure(ideasURL)
    # If the page cannot be fetched, ask the operator for a replacement URL
    # (was Python-2 raw_input()).
    while ideasSoup == '':
        ideasURL = input()
        ideasSoup = soupStructure(ideasURL)
    tablesList = ideasSoup.body.findAll('table')
    # Tables before index 2 are skipped — presumably page chrome; verify
    # against the live page layout.
    for mainTable in tablesList[2:]:
        if mainTable is None:
            continue
        for aTag in mainTable.findAll('a'):
            # A question mark after the hyperlinked name means there is
            # something wrong with the entry, so just ignore it.
            if aTag.parent.find(text=re.compile('.*[?].*')) is not None:
                continue
            try:
                returnVariable, affiliationReturnValue = econPapersProfileExtractor(
                    'https://ideas.repec.org' + aTag['href'], specialization)
                writer.writerow(returnVariable)
                writerAffiliations.writerow(affiliationReturnValue)
                print(str(returnVariable))
            except Exception:
                # Best-effort: a single bad profile must not abort the crawl
                # (the original used a bare `except: pass`).
                pass
    return False
def econPapersProfileExtractor(ideasURL, specialization):
    """Scrape one IDEAS/RePEc author profile into a dataset row.

    Extracts name, email, affiliations, locations and homepage from the
    profile page, collects the author's publications (deduplicating alternate
    versions), infers the author's dominant specialization from per-paper
    specialization lists, and picks up to 7 publications grouped by their
    most frequent keywords.

    Parameters:
        ideasURL: URL of the author's IDEAS profile page.
        specialization: fallback specialization label (overwritten by the
            inferred dominant specialization when one is found).

    Returns:
        (returnValue, affiliationReturnValue): the main dataset row and the
        affiliations row; both are empty lists when the page has no main DIV.

    NOTE(review): relies on module-level helpers (`soupStructure`,
    `strip_tags`, `convert_unicode`, `is_ascii`,
    `keywordAndSpecializationExtractor`) and a module-level selenium
    `browser` — confirm those are initialized before calling.
    """
    # Local import: the original used Python-2 `urllib.quote_plus`, which
    # lives in urllib.parse on Python 3.
    from urllib.parse import quote_plus
    returnValue = []
    affiliationReturnValue = []
    ideasSoup = soupStructure(ideasURL)
    # If the page cannot be fetched, ask the operator for a replacement URL
    # (was Python-2 raw_input()).
    while ideasSoup == '':
        ideasURL = input()
        ideasSoup = soupStructure(ideasURL)
    obfuscateScriptMainDIV = ideasSoup.body.find('div', id="main")
    if obfuscateScriptMainDIV is not None:
        # --- First name ---
        firstNameTag = ideasSoup.body.find(text=re.compile('.*First Name:.*'))
        if firstNameTag is None:
            firstName = ''
        else:
            firstNameTag = firstNameTag.parent.parent.findNext('td')
            # renderContents() returns bytes; decode before stripping tags
            # (was Python-2 `unicode(..., 'utf8')`).
            firstName = strip_tags(firstNameTag.renderContents().decode('utf8'))
        # --- Last name ---
        lastNameTag = ideasSoup.body.find(text=re.compile('.*Last Name:.*'))
        if lastNameTag is None:
            lastName = ''
        else:
            lastNameTag = lastNameTag.parent.parent.findNext('td')
            lastName = strip_tags(lastNameTag.renderContents().decode('utf8'))
        # --- Locations (text after the "Location:" label) ---
        locations = []
        for locationTag in ideasSoup.body.findAll(text=re.compile('.*Location:.*')):
            locationTag = strip_tags(locationTag)
            locations.append(locationTag[10:])
        # --- Affiliations ---
        affiliations = []
        affiliationTags = ideasSoup.body.find(
            'div', id='affiliation-body').findAll('h4')
        for affiliationTag in affiliationTags:
            affiliations.append(
                strip_tags(affiliationTag.renderContents().decode('utf8')))
        # --- Homepage ---
        homepageParentTag = ideasSoup.body.find(text=re.compile('.*Homepage:.*'))
        if homepageParentTag is None:
            homepage = ''
        else:
            homepageTag = homepageParentTag.parent.parent.findNext(
                'td').findNext('a')
            # Bug fix: the original re-checked homepageParentTag here, so a
            # missing anchor would have raised instead of falling back.
            if homepageTag is None:
                homepage = ''
            else:
                homepage = homepageTag['href']
        # --- Email (rendered by the browser, since it is obfuscated in HTML) ---
        browser.get(ideasURL)
        email = ''
        emailTag = browser.find_element_by_xpath(
            "//*[@id='details-body']/table/tbody/tr[7]/td[2]")
        # A space in the cell text indicates the "not public" placeholder
        # rather than an address.
        if emailTag is not None and ' ' not in emailTag.text:
            email = emailTag.text
            print(email)
        # --- Publications ---
        publicationArray = []
        specializations = []
        specializationRepetition = []
        publicationsList = browser.find_element_by_id(
            'works-group').find_elements_by_tag_name('li')
        for publicationItem in publicationsList:
            citationText = convert_unicode(publicationItem.text)
            # If there are multiple versions of the citation, only take the
            # first one.
            citationTextFirst = re.split(r'\s*\n+\s*', citationText)
            publicationTitleTag = publicationItem.find_element_by_tag_name('a')
            publicationTitle = publicationTitleTag.text
            notUnicodeCharacter = True
            try:
                print('publicationTitle: ' + publicationTitle)
                print('citationText: ' + citationTextFirst[0])
            except Exception:
                print("Unicode character found.")
                notUnicodeCharacter = False
            # Only keep ASCII (all-English) titles.
            if not (is_ascii(publicationTitle) and notUnicodeCharacter):
                continue
            # Skip citations that are another version of one already seen.
            isNotAnotherVersion = True
            for previousPublication in publicationArray:
                if publicationTitle.lower() in previousPublication[2].lower():
                    isNotAnotherVersion = False
                    print(citationText, "Is another version of",
                          previousPublication[2])
                    break
            # The publication year follows the author list: "<names>, <year>."
            paperYearGroup = re.search(r"^\D+[,][ ](\d+)[.]",
                                       citationTextFirst[0])
            if isNotAnotherVersion and paperYearGroup is not None:
                paperYear = paperYearGroup.group(1)
                # Extract keywords and specializations for this paper from
                # econpapers.
                keywords, paperspecList = keywordAndSpecializationExtractor(
                    publicationTitleTag.get_attribute("href"),
                    'http://econpapers.repec.org/scripts/search.pf?ft='
                    + quote_plus(('"' + publicationTitle + '" ' + firstName
                                  + " " + lastName).encode('utf-8')))
                if (keywords is not None and keywords != []
                        and paperspecList is not None and paperspecList != []):
                    print("keywords: ", keywords)
                    publicationArray.append([
                        publicationTitle, paperYear, citationTextFirst[0],
                        keywords, paperspecList
                    ])
        print("publicationArray: ", publicationArray)
        # Newest publications first.
        publicationArray.sort(key=itemgetter(1), reverse=True)
        print("Sorted publicationArray: ", publicationArray)
        # Tally per-paper specializations; stop once one reaches 7 hits.
        specializationFound = False
        for publicationElement in publicationArray:
            paperspecList = publicationElement[4]
            for paperSpec in paperspecList:
                if paperSpec in specializations:
                    specializationRepetition[
                        specializations.index(paperSpec)] += 1
                    if specializationRepetition[
                            specializations.index(paperSpec)] == 7:
                        specializationFound = True
                        break
                else:
                    specializations.append(paperSpec)
                    specializationRepetition.append(1)
            if specializationFound:
                break
        print("specializations: ", specializations)
        print("specializationRepetition: ", specializationRepetition)
        # Pick the most frequent specialization.
        maxSpex = -1
        maxIndex = -1
        for specIndex in range(len(specializationRepetition)):
            if specializationRepetition[specIndex] > maxSpex:
                maxSpex = specializationRepetition[specIndex]
                maxIndex = specIndex
        print("Specialization before:", specialization)
        specialization = ""
        print(publicationArray)
        pubIndex = 0
        if maxSpex != -1:
            specialization = specializations[maxIndex]
            # Drop publications that don't match the dominant specialization.
            while pubIndex < len(publicationArray):
                paperspecList = publicationArray[pubIndex][4]
                if specialization not in paperspecList:
                    print(publicationArray[pubIndex], "deleted!")
                    del publicationArray[pubIndex]
                else:
                    pubIndex += 1
        else:
            print("maxSpex:", maxSpex, "and len(publicationArray):",
                  len(publicationArray))
        print("Specialization after: " + specialization)
        print("publicationArray After:", publicationArray)
        # Group publications by keyword, counting keyword frequency.
        publicationsSorted = []
        for publication in publicationArray:
            for keyword in publication[3]:
                keywordIndex = -1
                for sortedIndex in range(len(publicationsSorted)):
                    if publicationsSorted[sortedIndex][0] == keyword:
                        keywordIndex = sortedIndex
                        break
                if keywordIndex != -1:
                    publicationsSorted[keywordIndex][1] += 1
                    publicationsSorted[keywordIndex][2].append(publication)
                else:
                    publicationsSorted.append([keyword, 1, [publication]])
        publicationsSorted.sort(key=itemgetter(1), reverse=True)
        publicationsPicked = []
        print("publicationsSorted:", publicationsSorted)
        location = locations[0] if len(locations) != 0 else ""
        affiliation = affiliations[0] if len(affiliations) != 0 else ""
        # Assemble the main dataset row.
        returnValue = [
            firstName, lastName, email, specialization, ideasURL,
            affiliation, location, homepage
        ]
        # Assemble the affiliations row (affiliation/location pairs).
        affiliationReturnValue = [firstName, lastName, email]
        for affiliationIndex in range(len(affiliations)):
            affiliationReturnValue.append(affiliations[affiliationIndex])
            if affiliationIndex < len(locations):
                location = locations[affiliationIndex]
            else:
                location = ""
            affiliationReturnValue.append(location)
        # Pick up to 7 distinct publications, most frequent keywords first.
        for publicationSorted in publicationsSorted:
            if len(publicationsPicked) >= 7:
                break
            publicationItemTitle = publicationSorted[2][0][0]
            publicationKeywordIndex = 0
            # Skip over titles already picked under another keyword.
            while (publicationItemTitle in publicationsPicked
                   and publicationKeywordIndex < len(publicationSorted[2]) - 1):
                publicationKeywordIndex += 1
                publicationItemTitle = publicationSorted[2][
                    publicationKeywordIndex][0]
            if publicationItemTitle in publicationsPicked:
                continue
            publicationsPicked.append(
                publicationSorted[2][publicationKeywordIndex][0])
            returnValue.append(publicationSorted[2][publicationKeywordIndex][0])
            returnValue.append(publicationSorted[2][publicationKeywordIndex][1])
            returnValue.append(publicationSorted[2][publicationKeywordIndex][2])
            returnValue.append(publicationSorted[0])
        print("returnValue:", returnValue)
        print("affiliationReturnValue:", affiliationReturnValue)
    return returnValue, affiliationReturnValue