Example #1
def classFinder(url, articleEncodedTitle, desiredCategory):

    qualityClass = "No-Class"
    importanceClass = "No-Class"
    desiredCategoryFound = False

    print("Finding class for url: " + url)
    if "Category:" in url or "User:"******"Talk:" in url or "User_talk:" in url or "Book:" in url:
        return "No-Class", "No-Class", False

    # Find the talk page hyperlink.
    talkPageHyperlink = 'http://en.wikipedia.org/wiki/Talk:' + articleEncodedTitle

    # Convert the hyperlink to absolute if it is relative.
    # talkPageHyperlink = urljoin(url, talkPageHyperlink)

    # Retrieve url content, and convert it into the beautifulsoup structure.
    talkSoup = soupStructure(talkPageHyperlink)
    # talkSoup = scrapeWebsite(talkPageHyperlink, 'div', 'id', "catlinks", "b", "text", "Wikipedia does not have a")

    errorCounter = 0
    while (talkSoup == '' or
           (talkSoup.find('div', id="catlinks") == None
            and talkSoup.find('b', text="Wikipedia does not have a")
            == None)) and errorCounter <= 10:

        # print "talkPageHyperlink:", talkPageHyperlink, "is not found. Please enter a new one:"
        # talkPageHyperlink = raw_input()

        talkSoup = soupStructure(talkPageHyperlink)

        errorCounter += 1

    while True:
        try:
            r = requests.get(
                'https://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&titles='
                + articleEncodedTitle)
            r = r.json()
            query = r['query']
            pages = query['pages']
            break
        except Exception as e:
            print(
                "\n\n\nI cannot retrieve the revision ID of this article! Error Message: "
                + str(e))
            time.sleep(1)
def WikipediaStatsGenerator(recommendationsFile):

    previousURLs = []

    with open(recommendationsFile + '.csv', 'r', newline='') as fr:
        reader = csv.reader(fr)

        with open(recommendationsFile + '_Stats.csv', 'w', newline='') as fw:
            writer = csv.writer(fw)

            pubResultRow = [
                'ID', 'Title', 'Edit Protection Level', 'Class', 'Importance',
                'Page Length', '# watchers', 'Time of Last Edit',
                '# redirects to this page', 'Page Creation Date',
                'Total # edits', 'Recent # edits (within past 30 days)',
                'Recent # distinct authors', '# views (last 90 days)',
                'Total # references', '# references published after 2010',
                '# External Hyperlinks'
            ]
            writer.writerow(pubResultRow)

            header = next(reader)
            for row in reader:
                for i in range(2, len(row) - 1, 3):
                    if row[i] != "" and row[i + 1] != "":
                        wikipageTitle = row[i]
                        print("Wikipage Title:", wikipageTitle)
                        wikipageURL = row[i + 1]
                        print("Wikipage URL:", wikipageURL)
                        wikipediaSoup = soupStructure(wikipageURL)
                        print("Soup is Retrieved.")

                        if wikipediaSoup != "" and not wikipageURL in previousURLs:
                            pubResultRow = WikipediaPageStats(
                                wikipageURL, wikipediaSoup, wikipageTitle,
                                'Econ')

                            trialNum = 0
                            while pubResultRow == [] and trialNum < 10:
                                trialNum += 1
                                # print("wikipageURL:", wikipageURL, "is not found. Please enter a new one:")
                                # wikipageURL = input()
                                pubResultRow = WikipediaPageStats(
                                    wikipageURL, wikipediaSoup, wikipageTitle,
                                    'Econ')

                            print(pubResultRow)
                            writer.writerow(pubResultRow)
                            previousURLs.append(wikipageURL)
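All of these examples lean on a soupStructure() helper that the listing never shows. Judging by how it is called, it fetches a URL and returns a BeautifulSoup tree, or the empty string '' when the fetch or parse fails. A minimal sketch under that assumption (the timeout and parser choice are guesses, not the author's code):

import requests
from bs4 import BeautifulSoup

def soupStructure(url):
    # Hypothetical reconstruction: fetch the page and return its parse tree,
    # or '' if anything goes wrong, matching how callers test for ''.
    try:
        response = requests.get(url, timeout=10)  # timeout value is an assumption
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except Exception:
        return ''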
def classFinder(url, articleEncodedTitle, desiredCategory):

    qualityClass = "No-Class"
    importanceClass = "No-Class"
    desiredCategoryFound = False

    print("Finding class for url: " + url)
    if "Category:" in url or "User:"******"Talk:" in url or "User_talk:" in url or "Book:" in url:
        return "No-Class", "No-Class", False

    talkPageHyperlink = 'http://en.wikipedia.org/wiki/Talk:' + articleEncodedTitle

    # Convert the hyperlink to absolute if it is relative.
    # talkPageHyperlink = urljoin(url, talkPageHyperlink)

    talkSoup = soupStructure(talkPageHyperlink)
    # talkSoup = scrapeWebsite(talkPageHyperlink, 'div', 'id', "catlinks", "b", "text", "Wikipedia does not have a")

    errorCounter = 0
    while (talkSoup == '' or
           (talkSoup.find('div', id="catlinks") == None
            and talkSoup.find('b', text="Wikipedia does not have a")
            == None)) and errorCounter <= 10:

        # print("talkPageHyperlink:", talkPageHyperlink, "is not found. Please enter a new one:")
        # talkPageHyperlink = input()
        talkSoup = soupStructure(talkPageHyperlink)

        errorCounter += 1

    while True:
        try:
            r = requests.get(
                'https://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&titles='
                + articleEncodedTitle)
            r = r.json()
            query = r['query']
            pages = query['pages']
            break
        except Exception as e:
            print(
                "\n\n\nI cannot retrieve the revision ID of this article! Error Message: "
                + str(e))
            time.sleep(1)
    try:
        revisions = list(pages.values())[0]['revisions']
        revisionID = revisions[0]['revid']
    except:
        return "No-Class", "No-Class", False
    qualityDict = None
    while True:
        try:
            r = requests.get('https://ores.wmflabs.org/scores/enwiki/wp10/' +
                             str(revisionID))
            print("ores.wmflabs.org responded: " + str(r))
            r = r.json()
            qualityDict = r[str(revisionID)]
            innerIndex = 0
            while 'prediction' not in qualityDict and innerIndex < 4:
                qualityDict = qualityDict['score']
                innerIndex = innerIndex + 1
                print("Found score in qualityDict for the " + str(innerIndex) +
                      " time.")
            qualityClass = qualityDict['prediction']
            break
        except Exception as e:
            print(
                "\n\n\nI cannot retrieve the class of this article from the API! Error Message: "
                + str(e))
            time.sleep(1)

    probabilitiesOfEachClass = qualityDict['probability']
    FAClassProbability = probabilitiesOfEachClass['FA']
    GAClassProbability = probabilitiesOfEachClass['GA']
    BClassProbability = probabilitiesOfEachClass['B']
    CClassProbability = probabilitiesOfEachClass['C']
    StartClassProbability = probabilitiesOfEachClass['Start']
    StubClassProbability = probabilitiesOfEachClass['Stub']

    weightedAverage = (
        float(StubClassProbability) + 2 * float(StartClassProbability) +
        3 * float(CClassProbability) + 4 * float(BClassProbability) +
        5 * float(GAClassProbability) + 6 * float(FAClassProbability)) / (
            float(StubClassProbability) + float(StartClassProbability) +
            float(CClassProbability) + float(BClassProbability) +
            float(GAClassProbability) + float(FAClassProbability))

    if weightedAverage > 5:
        qualityClass = "FA-Class"
    elif weightedAverage > 4:
        qualityClass = "GA-Class"
    elif weightedAverage > 3:
        qualityClass = "B-Class"
    elif weightedAverage > 2:
        qualityClass = "C-Class"
    elif weightedAverage > 1:
        qualityClass = "Start-Class"
    else:
        qualityClass = "Stub-Class"

    print("qualityClass: " + qualityClass)

    if talkSoup != '' and talkSoup.find(
            'div', id="catlinks") != None and talkSoup.find(
                'b', text="Wikipedia does not have a") == None:
        categoryDIVTag = talkSoup.find('div', id="catlinks")

        # if categoryDIVTag.find(text=re.compile('.*FA-Class.*')) != None:
        #     qualityClass = "FA-Class"

        # elif categoryDIVTag.find(text=re.compile('.* A-Class.*')) != None:
        #     qualityClass = "A-Class"

        # elif categoryDIVTag.find(text=re.compile('.*GA-Class.*')) != None:
        #     qualityClass = "GA-Class"

        # elif categoryDIVTag.find(text=re.compile('.*B+ class.*')) != None:
        #     qualityClass = "B+ class"

        # elif categoryDIVTag.find(text=re.compile('.*B-Class.*')) != None:
        #     qualityClass = "B-Class"

        # elif categoryDIVTag.find(text=re.compile('.*C-Class.*')) != None:
        #     qualityClass = "C-Class"

        # elif categoryDIVTag.find(text=re.compile('.*Stub-Class.*')) != None:
        #     qualityClass = "Stub-Class"

        if categoryDIVTag.find(text=re.compile('.*Top-importance.*')) != None:
            importanceClass = "Top-importance"

        elif categoryDIVTag.find(
                text=re.compile('.*High-importance.*')) != None:
            importanceClass = "High-importance"

        elif categoryDIVTag.find(
                text=re.compile('.*Mid-importance.*')) != None:
            importanceClass = "Mid-importancee"

        elif categoryDIVTag.find(
                text=re.compile('.*Low-importance.*')) != None:
            importanceClass = "Low-importance"

        elif categoryDIVTag.find(text=re.compile('.*NA-importance.*')) != None:
            importanceClass = "NA-importance"

        elif categoryDIVTag.find(
                text=re.compile('.*Unknown-importance.*')) != None:
            importanceClass = "Unknown-importance"

        elif categoryDIVTag.find(
                text=re.compile('.*Bottom-importance.*')) != None:
            importanceClass = "Bottom-importance"

        if desiredCategory.lower() in categoryDIVTag.prettify().lower():
            desiredCategoryFound = True

    return qualityClass, importanceClass, desiredCategoryFound
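classFinder above chains two HTTP calls: the MediaWiki query API to obtain the latest revision ID of the article, and the ORES wp10 endpoint to get a quality prediction for that revision. A condensed sketch of the same flow without the retry loops; latestQualityPrediction is a hypothetical name, and the unwrapping of nested 'score' keys mirrors what the code above does because the exact shape of the ORES payload can vary:

import requests

def latestQualityPrediction(articleEncodedTitle):
    # Ask the MediaWiki API for the newest revision ID of the article.
    api = ('https://en.wikipedia.org/w/api.php?action=query&format=json'
           '&prop=revisions&titles=' + articleEncodedTitle)
    pages = requests.get(api).json()['query']['pages']
    revisionID = list(pages.values())[0]['revisions'][0]['revid']

    # Score that revision with the ORES wp10 model used above.
    scores = requests.get('https://ores.wmflabs.org/scores/enwiki/wp10/'
                          + str(revisionID)).json()
    result = scores[str(revisionID)]
    # Some responses nest the answer under 'score'; unwrap until a prediction appears.
    while 'prediction' not in result and 'score' in result:
        result = result['score']
    return result.get('prediction'), result.get('probability')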
def IsWikipageAppropriate(title, hyperlink):

    if ("Category:" in title or "User:"******"Talk:" in title or "User talk:" in title or
            "Book:" in title):
        return False, None

    wikipediaSoup = soupStructure(hyperlink)
    print("Wikipedia page Soup is Retrieved.")

    trialNum = 0
    while wikipediaSoup == "" and trialNum < 10:
        trialNum += 1
        print("Wikipedia page:" + hyperlink +
              " Soup is not retreived. Please enter an appropriate URL:")
        # hyperlink = input()
        # if hyperlink == "1":
        #     return False, None
        wikipediaSoup = soupStructure(hyperlink)

    if wikipediaSoup != "":
        resultRow = WikipediaPageStats(
            hyperlink, wikipediaSoup, title, 'list')

        trialNum = 0
        while resultRow == [] and trialNum < 10:
            trialNum += 1
            print("wikipageURL:", hyperlink,
                  "is not found. Please enter a new one:")
            # hyperlink = input()
            # if hyperlink == "1":
            #     return False, None
            wikipediaSoup = soupStructure(hyperlink)

            innerTrialNum = 0
            while wikipediaSoup == "" and innerTrialNum < 10:
                innerTrialNum += 1
                print("Wikipedia page:" + hyperlink +
                      " Soup is not retreived. Please enter an appropriate URL:")
                # hyperlink = input()
                # if hyperlink == "1":
                #     return False, None
                wikipediaSoup = soupStructure(hyperlink)

            resultRow = WikipediaPageStats(
                hyperlink, wikipediaSoup, title, 'list')

        if resultRow == []:
            return False, None
        print("Wikipedia page Stats:", resultRow)

        # If the edit protection of the page is not None:
        if resultRow["editProtectionLevel"].lower() != "none":
            print("The Wikipedia page is edit protected. Do not recommend it.")
            return False, None
        if resultRow["qualityClass"].lower() == "stub-class":
            print("The Wikipedia page is a Stub. Do not recommend it.")
            return False, None
        # if resultRow[3].lower() == "b-class":
        #   print "The Wikipedia page is a B-Class. Do not recommend it."
        #   return False, None
        # if resultRow[3].lower() == "b+ class":
        #   print "The Wikipedia page is a B+ class. Do not recommend it."
        #   return False, None
        # if resultRow[3].lower() == "ga-class":
        #   print "The Wikipedia page is a GA-Class. Do not recommend it."
        #   return False, None
        # if resultRow[3].lower() == "a-class":
        #   print "The Wikipedia page is a A-Class. Do not recommend it."
        #   return False, None
        # if resultRow[3].lower() == "fa-class":
        #   print "The Wikipedia page is a FA-Class. Do not recommend it."
        #   return False, None
        if num(resultRow["viewsNum"]) < 1000:
            print(
                "The Wikipedia page has been viewed less than 1000 times. Do not recommend it.")
            return False, None

        print("The Wikipedia page is OK to recommend.")
        return True, resultRow
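IsWikipageAppropriate filters on num(resultRow["viewsNum"]), but num() is never defined in the listing. A plausible sketch, assuming it simply normalizes a formatted count such as '12,345' into an integer; the name comes from the calls above, the behavior is inferred:

def num(value):
    # Hypothetical helper: turn a formatted count such as '12,345' into an int.
    return int(str(value).replace(',', '').strip())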
Example #5
def IsWikipageAppropriate(title, hyperlink):

    if ("Category:" in title or "User:"******"Talk:" in title
            or "User talk:" in title or "Book:" in title
            or "Template:" in title):
        return False, None

    wikipediaSoup = soupStructure(hyperlink)
    print("Wikipedia page Soup is Retrieved.")

    trialNum = 0
    while wikipediaSoup == "" and trialNum < 10:
        trialNum += 1
        print("Wikipedia page:" + hyperlink +
              " Soup is not retreived. Please enter an appropriate URL:")
        # hyperlink = input()
        # if hyperlink == "1":
        #     return False, None
        wikipediaSoup = soupStructure(hyperlink)

    if wikipediaSoup != "":
        resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title, 'list')

        trialNum = 0
        while resultRow == [] and trialNum < 10:
            trialNum += 1
            print("wikipageURL:", hyperlink,
                  "is not found. Please enter a new one:")
            # hyperlink = input()
            # if hyperlink == "1":
            #     return False, None
            wikipediaSoup = soupStructure(hyperlink)

            innerTrialNum = 0
            while wikipediaSoup == "" and innerTrialNum < 10:
                innerTrialNum += 1
                print(
                    "Wikipedia page:" + hyperlink +
                    " Soup is not retreived. Please enter an appropriate URL:")
                # hyperlink = input()
                # if hyperlink == "1":
                #     return False, None
                wikipediaSoup = soupStructure(hyperlink)

            resultRow = WikipediaPageStats(hyperlink, wikipediaSoup, title,
                                           'list')

        if resultRow == []:
            return False, None
        print("Wikipedia page Stats:", resultRow)

        # If the edit protection of the page is not None:
        # if resultRow[2].lower() != "none":
        #     print("The Wikipedia page is edit protected. Do not recommend it.")
        #     return False, None
        if resultRow['qualityClass'].lower(
        ) == "stub-class" or resultRow['qualityClass'].lower() == "start":
            print("The Wikipedia page is a " + resultRow['qualityClass'] +
                  ". Do not recommend it.")
            return False, None
        if resultRow['importanceClass'].lower(
        ) != 'high-importance' and resultRow['importanceClass'].lower(
        ) != 'top-importance':
            print("The Wikipedia page is a " + resultRow['importanceClass'] +
                  ". Do not recommend it.")
            return False, None
        if num(resultRow['length']) < 1000:
            print(
                "The Wikipedia page has been viewed less than 1000 times. Do not recommend it."
            )
            return False, None

        print("The Wikipedia page is OK to recommend.")
        return True, resultRow
Example #6
def econPapersCategoriesExtractor(ideasURL, startSpecialization,
                                  endSpecialization):

    with open('Ideas_Repec_Dataset.csv', 'wb') as fw:
        writer = csv.writer(fw)

        with open('Ideas_Repec_Affiliations.csv', 'wb') as fwAffiliations:
            writerAffiliations = csv.writer(fwAffiliations)

            resultRow = [
                'firstName', 'lastName', 'email', 'specialization',
                'EconPapers Profile', 'affiliation', 'location', 'homepage',
                'publication1', 'publicationYear1', 'citation1',
                'firstKeyword1', 'publication2', 'publicationYear2',
                'citation2', 'firstKeyword2', 'publication3',
                'publicationYear3', 'citation3', 'firstKeyword3',
                'publication4', 'publicationYear4', 'citation4',
                'firstKeyword4', 'publication5', 'publicationYear5',
                'citation5', 'firstKeyword5', 'publication6',
                'publicationYear6', 'citation6', 'firstKeyword6',
                'publication7', 'publicationYear7', 'citation7',
                'firstKeyword7'
            ]
            writer.writerow(resultRow)

            resultRowAffiliations = [
                'firstName', 'lastName', 'email', 'affiliation1', 'location1',
                'affiliation2', 'location2', 'affiliation3', 'location3',
                'affiliation4', 'location4', 'affiliation5', 'location5',
                'affiliation6', 'location6', 'affiliation7', 'location7'
            ]
            writerAffiliations.writerow(resultRowAffiliations)

            ideasSoup = soupStructure(ideasURL)

            # If the content of the page is returned in BeautifulSoup structure:
            if ideasSoup != '':

                # Find the main list.
                mainList = ideasSoup.body.find(
                    text=re.compile('.*Accounting & Auditing.*')).parent.parent

                # If there is a main list:
                if mainList != None:

                    # Set if the startSpecialization is observed.
                    startSpecializationObserved = False

                    # Find all li tags inside the main list.
                    liTags = mainList.findAll('li')

                    for i in range(len(liTags)):

                        # Find the hyperlink tag inside the list item.
                        aTag = liTags[i].find('a')

                        specialization = aTag.nextSibling[1:]

                        print str(specialization)

                        if specialization == startSpecialization:
                            startSpecializationObserved = True

                        if specialization != "All new papers" and specialization != "German Papers" and startSpecializationObserved:

                            econPapersAuthorListExtractor(
                                writer, writerAffiliations, specialization,
                                'https://ideas.repec.org' + aTag['href'])

                            # if foundTheLastSubject:
                            # 	return
                        if specialization == endSpecialization:
                            return
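A sketch of how this extractor might be invoked. The listing URL and the end specialization here are placeholders (only 'Accounting & Auditing' is taken from the anchor text the code searches for), so treat this as an illustration of the call shape rather than a working crawl:

# Hypothetical invocation; the URL and endSpecialization are placeholders.
econPapersCategoriesExtractor('https://ideas.repec.org/i/e.html',
                              'Accounting & Auditing',
                              'Urban & Real Estate Economics')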
Example #7
def econPapersAuthorListExtractor(writer, writerAffiliations, specialization,
                                  ideasURL):

    # Retrieve the url html page content and convert it into BeautifulSoup structure.
    ideasSoup = soupStructure(ideasURL)

    # If the content of the page is returned in BeautifulSoup structure:
    while ideasSoup == '':
        ideasURL = raw_input()
        ideasSoup = soupStructure(ideasURL)

    # while True:
    # 	try:

    # # Set if startFirstname is observed.
    # startFirstnameObserved = False

    # # Set if startLastname is observed.
    # startLastnameObserved = False

    # Find the main tables list.
    tablesList = ideasSoup.body.findAll('table')

    for tableIndex in range(2, len(tablesList)):

        # Find the main table.
        mainTable = tablesList[tableIndex]

        # If there is a main table:
        if mainTable != None:

            # Find all a tags inside the main table.
            aTags = mainTable.findAll('a')

            for i in range(len(aTags)):

                # if aTags[i].find(text=re.compile(startFirstname)) != None:
                #     startFirstnameObserved = True

                #     if aTags[i].find(text=re.compile(startLastname)) != None:
                #         startLastnameObserved = True

                # If there is a question mark after the hyperlinked name, it means there is something wrong with it, so it's better to just ignore it.
                # if aTags[i].parent.find(text=re.compile('.*[?].*')) == None and startFirstnameObserved and startLastnameObserved:
                if aTags[i].parent.find(text=re.compile('.*[?].*')) == None:

                    try:
                        # Define an array as the return variable.
                        returnVariable, affiliationReturnValue = econPapersProfileExtractor(
                            'https://ideas.repec.org' + aTags[i]['href'],
                            specialization)

                        writer.writerow(returnVariable)
                        writerAffiliations.writerow(affiliationReturnValue)

                        print str(returnVariable)
                    except:
                        pass
                    # if foundTheLastSubject:
                    # 	return foundTheLastSubject

        # 	break
        # except:
        # 	ideasURL = raw_input()
        # 	ideasSoup = soupStructure(ideasURL)
    return False
Example #8
def econPapersProfileExtractor(ideasURL, specialization):

    returnValue = []
    affiliationReturnValue = []

    # Retrieve the url html page content and convert it into BeautifulSoup structure.
    ideasSoup = soupStructure(ideasURL)

    # If the content of the page is returned in BeautifulSoup structure:
    while ideasSoup == '':
        ideasURL = raw_input()
        ideasSoup = soupStructure(ideasURL)

    # while True:
    # 	try:

    # Find the main DIV.
    obfuscateScriptMainDIV = ideasSoup.body.find('div', id="main")

    # If there is a main DIV:
    if obfuscateScriptMainDIV != None:

        # Find the FirstName tag.
        firstNameTag = ideasSoup.body.find(text=re.compile('.*First Name:.*'))

        if firstNameTag == None:

            firstName = ''

        else:
            # firstNameTag = strip_tags(firstNameTag)
            # # Find the first name.
            # firstName = firstNameTag[15:]
            firstNameTag = firstNameTag.parent.parent.findNext('td')
            # Find the first name.
            firstName = strip_tags(
                unicode(firstNameTag.renderContents(), 'utf8'))

        # Find the LastName tag.
        lastNameTag = ideasSoup.body.find(text=re.compile('.*Last Name:.*'))

        if lastNameTag == None:

            lastName = ''

        else:
            # lastNameTag = strip_tags(lastNameTag)
            # # Find the last name.
            # lastName = lastNameTag[14:]
            lastNameTag = lastNameTag.parent.parent.findNext('td')
            # Find the last name.
            lastName = strip_tags(unicode(lastNameTag.renderContents(),
                                          'utf8'))

        locations = []

        # Find all the Location tags.
        locationTags = ideasSoup.body.findAll(text=re.compile('.*Location:.*'))

        for locationTag in locationTags:

            locationTag = strip_tags(locationTag)
            # Find the location.
            locations.append(locationTag[10:])

        affiliations = []

        # Find the Affiliation tag.
        affiliationTags = ideasSoup.body.find(
            'div', id='affiliation-body').findAll('h4')

        for affiliationTag in affiliationTags:

            # Find the affiliation.
            affiliation = strip_tags(
                unicode(affiliationTag.renderContents(), 'utf8'))

            affiliations.append(affiliation)

        # Find the Homepage tag.
        homepageParentTag = ideasSoup.body.find(
            text=re.compile('.*Homepage:.*'))

        if homepageParentTag == None:

            homepage = ''

        else:
            # Find the homepage tag.
            homepageTag = homepageParentTag.parent.parent.findNext(
                'td').findNext('a')

            if homepageTag == None:

                homepage = ''

            else:
                # Find the homepage.
                homepage = homepageTag['href']

        browser.get(ideasURL)

        email = ''

        # Find the obfuscating email tag.
        emailTag = browser.find_element_by_xpath(
            "//*[@id='details-body']/table/tbody/tr[7]/td[2]")

        # If there is an email tag inside obfuscateScriptMainDIV:
        # if emailTag != None and emailTag.text != '[This author has chosen not to make the email address public]':
        if emailTag != None and not ' ' in emailTag.text:

            # Add the found email address in an appropriate format to returnVariable.
            email = emailTag.text

            print email

        # # Find the obfuscating script.
        # obfuscateScriptTag = obfuscateScriptMainDIV.find('div', id='details-body').find('span', {'data-liame2'})

        # # If there is a script tag inside obfuscateScriptMainDIV:
        # if obfuscateScriptTag != None:

        #     # Find the content of the obfuscating script.
        #     obfuscateScriptContent = obfuscateScriptTag.renderContents()

        #     # Find both email parts from the content of the obfuscating script.
        #     emailParts = re.findall("'([^',]+)'", hparser.unescape(obfuscateScriptContent))

        #     # Add the found email address in an appropriate format to returnVariable.
        #     returnVariable['email'] = liame2(emailParts)

        # Define publication array.
        publicationArray = []
        specializations = []
        specializationRepetition = []

        # Find the list of publications.
        publicationsList = browser.find_element_by_id(
            'works-group').find_elements_by_tag_name('li')

        # If there is any publication listed:
        if len(publicationsList) != 0:

            # For each publication:
            for publicationIndex in range(len(publicationsList)):

                # Extract the citation.
                citationText = convert_unicode(
                    publicationsList[publicationIndex].text)

                # If there are multiple versions of the citation, only take the first one.
                citationTextFirst = re.split(r'\s*\n+\s*', citationText)

                # Extract Publication title.
                publicationTitleTag = publicationsList[
                    publicationIndex].find_element_by_tag_name('a')
                publicationTitle = publicationTitleTag.text

                notUnicodeCharacter = True
                try:
                    print 'publicationTitle: ' + publicationTitle
                    print 'citationText: ' + citationTextFirst[0]

                except:
                    print "Unicode character found."
                    notUnicodeCharacter = False

                # Check if publicationTitle is in ASCII (all English characters); if so, continue:
                if is_ascii(publicationTitle) and notUnicodeCharacter:

                    # Identify if this citation has been shown up previously.
                    isNotAnotherVersion = True

                    # For all citations listed previously:
                    for citationIndex in range(len(publicationArray)):

                        # if citation has been shown up previously:
                        if publicationTitle.lower(
                        ) in publicationArray[citationIndex][2].lower():

                            # Citation has been shown up previously.
                            isNotAnotherVersion = False

                            print citationText, "Is another version of", publicationArray[
                                citationIndex][2]
                            break

                    paperYearGroup = re.search("^\D+[,][ ](\d+)[.]",
                                               citationTextFirst[0])

                    if isNotAnotherVersion and paperYearGroup != None:

                        paperYear = paperYearGroup.group(1)

                        # Extract the first keyword of this paper from econpapers.
                        # firstKeyword = keywordExtractor("http://econpapers.repec.org/scripts/search.pf?ft=" + urllib.quote_plus(publicationTitle.encode('utf-8')))
                        keywords, paperspecList = keywordAndSpecializationExtractor(
                            publicationTitleTag.get_attribute("href"),
                            'http://econpapers.repec.org/scripts/search.pf?ft='
                            + urllib.quote_plus(
                                ('"' + publicationTitle + '" ' + firstName +
                                 " " + lastName).encode('utf-8')))

                        if keywords != None and keywords != [] and paperspecList != None and paperspecList != []:

                            print "keywords: ", keywords

                            # Append the results to the returning array.
                            publicationArray.append([
                                publicationTitle, paperYear,
                                citationTextFirst[0], keywords, paperspecList
                            ])

                            print "publicationArray: ", publicationArray

            publicationArray.sort(key=itemgetter(1), reverse=True)
            print "Sorted publicationArray: ", publicationArray

            specializationFound = False
            for publicationElement in publicationArray:
                paperspecList = publicationElement[4]
                if paperspecList != []:
                    for paperSpec in paperspecList:
                        if paperSpec in specializations:
                            specializationRepetition[specializations.index(
                                paperSpec)] += 1
                            if specializationRepetition[specializations.index(
                                    paperSpec)] == 7:
                                specializationFound = True
                                break
                        else:
                            specializations.append(paperSpec)
                            specializationRepetition.append(1)
                if specializationFound:
                    break
            print "specializations: ", specializations
            print "specializationRepetition: ", specializationRepetition

            maxSpex = -1
            maxIndex = -1
            for specIndex in range(len(specializationRepetition)):
                if specializationRepetition[specIndex] > maxSpex:
                    maxSpex = specializationRepetition[specIndex]
                    maxIndex = specIndex
            print "Specialization before:", specialization
            specialization = ""
            print publicationArray
            pubIndex = 0
            if maxSpex != -1:
                specialization = specializations[maxIndex]
                while pubIndex < len(publicationArray):
                    paperspecList = publicationArray[pubIndex][4]
                    if not specialization in paperspecList:
                        print publicationArray[pubIndex], "deleted!"
                        del publicationArray[pubIndex]
                    else:
                        pubIndex += 1
            else:
                print "maxSpex:", maxSpex, "and len(publicationArray):", len(
                    publicationArray)

            print "Specialization after: " + specialization
            print "publicationArray After:", publicationArray

            # with open('test_publicationArray', 'wb') as f:
            # 	f.write(str(publicationArray))
            # f.closed

            publicationsSorted = []

            for publication in publicationArray:
                keywords = publication[3]

                for keyword in keywords:
                    keywordIndex = -1
                    for publicationSortedIndex in range(
                            len(publicationsSorted)):
                        if publicationsSorted[publicationSortedIndex][
                                0] == keyword:
                            keywordIndex = publicationSortedIndex
                            break
                    if keywordIndex != -1:
                        publicationsSorted[keywordIndex][1] += 1
                        publicationsSorted[keywordIndex][2].append(publication)
                    else:
                        publicationsSorted.append([keyword, 1, [publication]])

            publicationsSorted.sort(key=itemgetter(1), reverse=True)

            publicationsPicked = []

            print "publicationsSorted:", publicationsSorted

            # with open('test_publicationsSorted', 'wb') as f:
            # 	f.write(str(publicationsSorted))
            # f.closed

            if len(locations) != 0:
                location = locations[0]
            else:
                location = ""

            if len(affiliations) != 0:
                affiliation = affiliations[0]
            else:
                affiliation = ""

            # Return the result array.
            returnValue = [
                firstName, lastName, email, specialization, ideasURL,
                affiliation, location, homepage
            ]
            affiliationReturnValue = [firstName, lastName, email]

            for affiliationIndex in range(len(affiliations)):
                affiliationReturnValue.append(affiliations[affiliationIndex])
                if affiliationIndex < len(locations):
                    location = locations[affiliationIndex]
                else:
                    location = ""
                affiliationReturnValue.append(location)

            for publicationSorted in publicationsSorted:
                if len(publicationsPicked) < 7:
                    publicationItemTitle = publicationSorted[2][0][0]

                    publicationKeywordIndex = 0

                    while publicationItemTitle in publicationsPicked and publicationKeywordIndex < len(
                            publicationSorted[2]) - 1:

                        publicationKeywordIndex += 1

                        publicationItemTitle = publicationSorted[2][
                            publicationKeywordIndex][0]

                    if publicationItemTitle in publicationsPicked:
                        continue
                    else:
                        publicationsPicked.append(
                            publicationSorted[2][publicationKeywordIndex][0])

                        returnValue.append(
                            publicationSorted[2][publicationKeywordIndex][0])
                        returnValue.append(
                            publicationSorted[2][publicationKeywordIndex][1])
                        returnValue.append(
                            publicationSorted[2][publicationKeywordIndex][2])
                        returnValue.append(publicationSorted[0])
                else:
                    break

            print "returnValue:", returnValue
            print "affiliationReturnValue:", affiliationReturnValue

            # foundTheLastSubject = False
            # if firstName == endFirstname and lastName == endLastname:
            # 	foundTheLastSubject = True

            return returnValue, affiliationReturnValue
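Example #8 also depends on several small helpers that are not shown here (is_ascii, strip_tags, convert_unicode, keywordAndSpecializationExtractor). As an illustration, one common way to write is_ascii, which the loop above uses to skip non-English publication titles; this is an assumption, not the author's implementation:

def is_ascii(text):
    # Hypothetical helper: True only when every character is plain ASCII.
    return all(ord(character) < 128 for character in text)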