Python RipPage示例，Classes.RipPage.RipPage Python示例

示例#1

0

显示文件

文件： UnigoLeadsOld.py 项目： kyajpauley/cerebro

    def getResultPageInfo(self):
        """Collect the scholarship details shown on the currently loaded result page.

        Every labelled section is located by XPath, its ``textContent`` is read
        and the text is passed through ``CleanText.cleanALLtheText``. These
        sections are looked up without an existence check, so a missing one
        surfaces whatever error ``find_element_by_xpath`` raises.

        Returns:
            list: ``[sponsor, awardAmount, recipients, requirements,
            additionalInfo, contact, address, sourceWebsite, sourceText]``.
            The last two entries are empty strings when the page has no
            ``button secondary`` link.
        """
        # Listed in the same order as the values appear in the returned list.
        sectionXpaths = (
            "//div/p/strong[text() = 'Awarded By']/../../following-sibling::div/p",
            "//div/p/strong[text() = 'Award Amount']/../../following-sibling::div/p",
            "//div/p/strong[text() = 'Recipients']/../../following-sibling::div/p",
            "//div/p/strong[text() = 'Requirements']/../../following-sibling::div",
            "//div/p/strong[text() = 'Additional Information']/../../following-sibling::div/p",
            "//div/p/strong[text() = 'Contact']/../../following-sibling::div/p",
            "//div/p/strong[text() = 'Address']/../../following-sibling::div",
        )

        resultPageArray = []
        for sectionXpath in sectionXpaths:
            sectionElement = self.driver.find_element_by_xpath(sectionXpath)
            rawSectionText = sectionElement.get_attribute('textContent')
            resultPageArray.append(CleanText.cleanALLtheText(rawSectionText))

        # The outbound link is optional; fall back to blanks when it is absent.
        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//a[@class='button secondary']"):
            sourceLink = self.driver.find_element_by_xpath("//a[@class='button secondary']")
            sourceWebsite = sourceLink.get_attribute('href')
            rawSourceText = RipPage.getPageSource(sourceWebsite)
            sourceText = CleanText.cleanALLtheText(rawSourceText)

        resultPageArray.append(sourceWebsite)
        resultPageArray.append(sourceText)
        return resultPageArray

示例#2

0

显示文件

 def __init__(self, url):
     """Remember the URL, fetch its page source and blank the remaining fields.

     Args:
         url: Address handed to ``RipPage.getPageSource``.
     """
     self.url = url
     # The page source is fetched right away from the stored URL.
     fetchedSource = RipPage.getPageSource(self.url)
     self.htmlSource = fetchedSource
     # Everything else starts out empty.
     self.title = self.pageurl = ''
     self.allurlsonpage = list()
     self.description = ''

示例#3

0

显示文件

文件： CollegeGreenLightLeads.py 项目： kyajmiller/Cerebro

    def getLeads(self):
        """Scrape every scholarship row of the current listing page into lead arrays.

        Titles, links, amounts and deadlines are read from the listing table.
        Each result page is then visited through
        ``goToResultPageAndPullInformation`` to obtain the sponsor (index 0),
        source website (index 1), description (index 2) and requirements
        (index 3). When the source website starts with ``http://`` or
        ``https://`` its page source is fetched and cleaned as well.

        The browser is shut down with ``self.driver.quit()`` on the way out,
        including when a scraping step raises, so a failure part-way through
        no longer leaves the browser session running. The exception itself
        still propagates to the caller.

        Returns:
            list: ``self.collegeGreenLightLeadsArrays``; each appended entry is
            ``[title, amount, deadline, sponsor, description, requirements,
            resultPageLink, sourceWebsite, sourceText]``.
        """
        try:
            arrayOfTitleLinkDivs = self.driver.find_elements_by_xpath("//td[@class='scholarshipNameColumn']/div/a")
            arrayOfAmountDivs = self.driver.find_elements_by_xpath("//td[@class='amount']")
            arrayOfDeadlineDivs = self.driver.find_elements_by_xpath("//td[@class='deadline']")

            titlesList = self.getTitlesList(arrayOfTitleLinkDivs)
            linksList = self.getLinksList(arrayOfTitleLinkDivs)
            amountsList = self.getAmountsList(arrayOfAmountDivs)
            deadlinesList = self.getDeadlinesList(arrayOfDeadlineDivs)

            # Index-based access into the sibling lists is kept on purpose: a
            # shorter sibling list still raises IndexError instead of being
            # silently truncated.
            for i, rawTitle in enumerate(titlesList):
                title = CleanText.cleanALLtheText(rawTitle)
                resultPageLink = linksList[i]
                amount = CleanText.cleanALLtheText(amountsList[i])
                deadline = deadlinesList[i]

                resultPageInfo = self.goToResultPageAndPullInformation(resultPageLink)
                sponsor = CleanText.cleanALLtheText(resultPageInfo[0])
                sourceWebsite = resultPageInfo[1]
                description = CleanText.cleanALLtheText(resultPageInfo[2])
                requirements = CleanText.cleanALLtheText(resultPageInfo[3])

                # Only absolute http(s) URLs are fetched.
                sourceText = ''
                if re.search('^https?://', sourceWebsite):
                    sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

                leadArray = [title, amount, deadline, sponsor, description, requirements, resultPageLink,
                             sourceWebsite, sourceText]
                self.collegeGreenLightLeadsArrays.append(leadArray)
        finally:
            # Always release the browser, even when scraping failed.
            self.driver.quit()

        return self.collegeGreenLightLeadsArrays

示例#4

0

显示文件

    def makeLeadArrayAndAddToGrantForwardLeads(self, singleResultArray):
        """Build the lead array for one search result and store it.

        Args:
            singleResultArray: Indexable whose entry 0 is the result name and
                entry 1 is the result page URL.

        The result page is visited via ``goToResultPageAndPullInformation``.
        The finished list is appended to ``self.arrayOfPivotLeads`` in the
        order ``[keyword, url, name, abstract, sponsor, amount,
        applicantType, citizenshipResidency, activityLocation, eligibility,
        categories, sourceWebsite, sourceText]``. Nothing is returned.
        """
        name = CleanText.cleanALLtheText(singleResultArray[0])
        url = singleResultArray[1]
        resultPageInfo = self.goToResultPageAndPullInformation(url)

        singleLeadArray = [CleanText.cleanALLtheText(self.searchTerm), url, name]

        # Positions inside resultPageInfo, in output order: abstract (6),
        # sponsor (1), amount (2), applicantType (3), citizenshipResidency (4),
        # activityLocation (5), eligibility (7), categories (8).
        for fieldPosition in (6, 1, 2, 3, 4, 5, 7, 8):
            singleLeadArray.append(CleanText.cleanALLtheText(resultPageInfo[fieldPosition]))

        # Position 0 holds the source website, stored uncleaned.
        sourceWebsite = resultPageInfo[0]
        singleLeadArray.append(sourceWebsite)
        rawSourceText = RipPage.getPageSource(sourceWebsite)
        singleLeadArray.append(CleanText.cleanALLtheText(rawSourceText))

        self.arrayOfPivotLeads.append(singleLeadArray)

示例#5

0

显示文件

文件： TrafficSafetyStoreLeads.py 项目： kyajmiller/Cerebro

    def getSourceWebsitesAndSourceTexts(self):
        """Return the outbound links on the page and the cleaned text behind each.

        Returns:
            tuple: ``(sourceWebsitesList, sourceTextsList)`` where the first
            list holds the ``href`` of every matched anchor and the second the
            cleaned page source fetched for each of those links, in the same
            order.
        """
        sourceWebsitesList = []
        for anchor in self.driver.find_elements_by_xpath("//div[@class='col-xs-8 col-xs-offset-2']/a"):
            sourceWebsitesList.append(anchor.get_attribute('href'))

        # First pass: download every page.
        rawSourceTexts = []
        for sourceWebsite in sourceWebsitesList:
            rawSourceTexts.append(RipPage.getPageSource(sourceWebsite))

        # Second pass: clean what was downloaded.
        sourceTextsList = []
        for rawSourceText in rawSourceTexts:
            sourceTextsList.append(CleanText.cleanALLtheText(rawSourceText))

        return sourceWebsitesList, sourceTextsList

示例#6

0

显示文件

文件： GrantForwardLeads.py 项目： kyajmiller/Cerebro

    def goToResultPageAndPullInformation(self, resultPageLink):
        """Open a result page and read its optional detail sections.

        Args:
            resultPageLink: URL loaded with ``self.driver.get``.

        Returns:
            list: ``[description, sponsor, amount, eligibility,
            submissionInfo, categories, sourceWebsite, sourceText, deadline]``.
            Any section that ``checkIfElementExists`` does not find is
            reported as an empty string.
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)

        def cleanedTextOrBlank(xpath):
            # '' when the element is absent, otherwise its cleaned textContent.
            if not self.checkIfElementExists(xpath):
                return ''
            rawText = self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
            return CleanText.cleanALLtheText(rawText)

        description = cleanedTextOrBlank("//div[@id = 'field-description']/div[@class = 'content-collapsed']")
        sponsor = cleanedTextOrBlank("//div[@class = 'sponsor-content']/div/a")
        amount = cleanedTextOrBlank("//div[@id = 'field-amount_info']/div[@class = 'content-collapsed']")
        eligibility = cleanedTextOrBlank("//div[@id = 'field-eligibility']/div[@class = 'content-collapsed']")
        submissionInfo = cleanedTextOrBlank("//div[@id = 'field-submission_info']/div[@class = 'content-collapsed']")
        categories = cleanedTextOrBlank("//div[@id = 'field-subjects']/ul")

        # The source link yields an href (kept as-is) plus the cleaned text of
        # the page it points to.
        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//a[@class = 'source-link btn btn-warning']"):
            sourceLink = self.driver.find_element_by_xpath("//a[@class = 'source-link btn btn-warning']")
            sourceWebsite = sourceLink.get_attribute('href')
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        deadline = cleanedTextOrBlank("//div[@class='table-responsive deadline-tables']/table/tbody")

        return [description, sponsor, amount, eligibility, submissionInfo, categories, sourceWebsite,
                sourceText, deadline]

示例#7

0

显示文件

文件： TeacherDotOrgLeads.py 项目： kyajmiller/Cerebro

    def getLeads(self):
        """Build one lead array per scholarship heading on the current page.

        Every ``h3`` outside the ``scholarship_intro_859`` div is treated as a
        title, except the headings 'Quick Links' and 'About Us', which are
        skipped. For each remaining heading the description, requirements and
        source website are looked up, the source website's page source is
        fetched, and the text fields are cleaned.

        Returns:
            list: ``self.teacherDotOrgLeadArrays``; each appended entry is
            ``[title, description, requirements, sourceWebsite, sourceText]``.
        """
        titleDivs = self.driver.find_elements_by_xpath("//h3[not(ancestor::div[@id='scholarship_intro_859'])]")

        for i in range(len(titleDivs)):
            title = titleDivs[i].get_attribute('textContent')
            # Defaults used when the corresponding element is not found below.
            requirements = ''
            sourceWebsite = ''
            description = ''

            if title != 'Quick Links' and title != 'About Us':
                if i == 0:
                    # The first heading takes its data from the 'intro' div;
                    # these lookups are not guarded by checkIfElementExists.
                    description = self.driver.find_element_by_xpath("//div[@class='intro']/p").get_attribute(
                        'textContent')
                    sourceWebsite = self.driver.find_element_by_xpath("//div[@class='intro']/p/a").get_attribute('href')
                    requirements = self.driver.find_element_by_xpath(
                            "//div[@class='intro']/following-sibling::*[1][self::ul]").get_attribute('textContent')
                else:
                    # XPath positions are 1-based, so the loop index is shifted by one.
                    # NOTE(review): presumably j selects the same h3 as titleDivs[i] - confirm.
                    j = i + 1
                    # Description: first <p> sibling following the heading.
                    if self.checkIfElementExists(
                                "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]" % j):
                        description = self.driver.find_element_by_xpath(
                            "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]" % j).get_attribute(
                            'textContent')
                    # Requirements: second <p> sibling, only when it directly follows another <p>.
                    if self.checkIfElementExists(
                                "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                        requirements = self.driver.find_element_by_xpath(
                            "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j).get_attribute(
                                'textContent')

                    # Source website: a link in the first paragraph wins; otherwise a
                    # link in the second paragraph is used when that paragraph exists.
                    if self.checkIfElementExists(
                                "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]/a" % j):
                        sourceWebsite = self.driver.find_element_by_xpath(
                                "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]/a" % j).get_attribute(
                                'href')
                    elif self.checkIfElementExists(
                                "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                        if self.checkIfElementExists(
                                    "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j):
                            sourceWebsite = self.driver.find_element_by_xpath(
                                    "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j).get_attribute(
                                    'href')

                # sourceWebsite may still be '' here when no link was found above.
                # NOTE(review): confirm RipPage.getPageSource copes with an empty URL.
                sourceText = RipPage.getPageSource(sourceWebsite)

                title = CleanText.cleanALLtheText(title)
                description = CleanText.cleanALLtheText(description)
                requirements = CleanText.cleanALLtheText(requirements)
                sourceText = CleanText.cleanALLtheText(sourceText)

                # sourceWebsite is stored uncleaned.
                leadArray = [title, description, requirements, sourceWebsite, sourceText]

                self.teacherDotOrgLeadArrays.append(leadArray)
        # Reached only when the loop finished without raising.
        self.driver.close()
        return self.teacherDotOrgLeadArrays

示例#8

0

显示文件

文件： UnigoLeads.py 项目： kyajpauley/cerebro

    def getResultPageInfo(self):
        """Read the optional detail sections of the currently loaded result page.

        Returns:
            list: ``[url, sponsor, awardAmount, recipients, requirements,
            additionalInfo, contact, address, deadlineInformation,
            sourceWebsite, sourceText]``. ``url`` is the driver's current
            URL; any section that ``checkIfElementExists`` does not find is
            reported as an empty string.
        """
        url = self.driver.current_url

        def cleanedText(xpath):
            # Cleaned textContent of the element at xpath (no existence check).
            rawText = self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
            return CleanText.cleanALLtheText(rawText)

        def cleanedTextOrBlank(xpath):
            # '' when the element is absent, otherwise its cleaned textContent.
            if self.checkIfElementExists(xpath):
                return cleanedText(xpath)
            return ''

        # The sponsor text additionally has the '» More Info' marker removed,
        # and only when the section is present.
        sponsor = ''
        if self.checkIfElementExists("//div/p/strong[text() = 'Awarded By']/../../following-sibling::div/p"):
            sponsor = re.sub('» More Info', '', cleanedText(
                "//div/p/strong[text() = 'Awarded By']/../../following-sibling::div/p"))

        awardAmount = cleanedTextOrBlank("//div/p/strong[text() = 'Award Amount']/../../following-sibling::div/p")
        recipients = cleanedTextOrBlank("//div/p/strong[text() = 'Recipients']/../../following-sibling::div/p")
        requirements = cleanedTextOrBlank("//div/p/strong[text() = 'Requirements']/../../following-sibling::div")
        additionalInfo = cleanedTextOrBlank(
            "//div/p/strong[text() = 'Additional Information']/../../following-sibling::div/p")
        contact = cleanedTextOrBlank("//div/p/strong[text() = 'Contact']/../../following-sibling::div/p")
        address = cleanedTextOrBlank("//div/p/strong[text() = 'Address']/../../following-sibling::div")
        deadlineInformation = cleanedTextOrBlank(
            "//strong[text() ='Deadline Information']/following-sibling::span[@class='smalltext']")

        # Outbound link: href kept as-is, plus the cleaned text of the page behind it.
        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//a[@class='button cta']"):
            sourceWebsite = self.driver.find_element_by_xpath("//a[@class='button cta']").get_attribute('href')
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        return [url, sponsor, awardAmount, recipients, requirements, additionalInfo, contact, address,
                deadlineInformation, sourceWebsite, sourceText]

示例#9

0

显示文件

文件： MastersInEducationLeads.py 项目： kyajmiller/Cerebro

    def makeLeadArray(self, resultArray):
        """Turn one scraped result into a lead array and store it.

        Args:
            resultArray: Indexable whose first five entries are used as title,
                amount, deadline, description and source website, in that
                order.

        The source website's page is fetched with ``RipPage.getPageSource``,
        cleaned and appended as the sixth entry. The finished list is appended
        to ``self.mastersInEducationLeadsArrays``; nothing is returned.
        """
        # Copy title, amount, deadline, description and source website.
        mastersInEducationLeadArray = [resultArray[position] for position in range(5)]

        sourceWebsite = mastersInEducationLeadArray[4]
        rawSourceText = RipPage.getPageSource(sourceWebsite)
        mastersInEducationLeadArray.append(CleanText.cleanALLtheText(rawSourceText))

        self.mastersInEducationLeadsArrays.append(mastersInEducationLeadArray)

示例#10

0

显示文件

文件： MastersInEducationLeads.py 项目： kyajpauley/cerebro

    def makeLeadArray(self, resultArray):
        """Assemble a lead array from one scraped result and record it.

        Args:
            resultArray: Indexable read at positions 0-4 for title, amount,
                deadline, description and source website.

        The page behind the source website is downloaded through
        ``RipPage.getPageSource`` and cleaned; the resulting six-item list is
        appended to ``self.mastersInEducationLeadsArrays``. Returns nothing.
        """
        title, amount, deadline, description, sourceWebsite = (
            resultArray[0], resultArray[1], resultArray[2], resultArray[3], resultArray[4])

        cleanedSourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        self.mastersInEducationLeadsArrays.append(
            [title, amount, deadline, description, sourceWebsite, cleanedSourceText])

示例#11

0

显示文件

文件： PopulateEmptyLinkBodyUsingDatabaseLinkUrl.py 项目： kyajpauley/cerebro

    def __init__(self):
        """Fill in the empty ``LinkBody`` of rows in ``dbo.LinkCrawlerHrefs``.

        Connects through ``SUDBConnect``, collects the ``LinkUrl`` of every
        row whose ``LinkBody`` is NULL or empty into ``self.linkUrlsList``,
        then downloads each page with ``RipPage.getPageSource``, cleans it
        with ``CleanText.cleanALLtheText`` and writes it back with an UPDATE
        statement keyed on the URL.
        """
        self.db = SUDBConnect()
        self.linkUrlsList = []

        rowsWithEmptyLinkBody = self.db.getRowsDB("select * from dbo.LinkCrawlerHrefs where ISNULL(LinkBody, '') = ''")
        # Iterating an empty result is a no-op, so no length guard is needed.
        for row in rowsWithEmptyLinkBody:
            self.linkUrlsList.append(row.LinkUrl)

        for link in self.linkUrlsList:
            linkbody = RipPage.getPageSource(link)
            cleanLinkBody = CleanText.cleanALLtheText(linkbody)
            # The statement is assembled by concatenation, so a single quote in
            # the URL would end the string literal early. Doubling it is the
            # SQL escape for a quote inside a literal.
            escapedLink = link.replace("'", "''")
            # NOTE(review): cleanLinkBody is concatenated unchanged - this
            # assumes cleanALLtheText already makes the text safe for a SQL
            # literal; confirm. If SUDBConnect offers parameterized queries,
            # prefer those over string building.
            self.db.insertUpdateOrDeleteDB(
                "UPDATE dbo.LinkCrawlerHrefs SET LinkBody='" + cleanLinkBody + "' WHERE LinkUrl='" + escapedLink + "'")

示例#12

0

显示文件

文件： GoogleLeadsUpdateEmptyLinkBody.py 项目： kyajpauley/cerebro

    def __init__(self):
        """Fill in the empty ``LinkBody`` of rows in ``dbo.GoogleLeads``.

        Connects through ``SUDBConnect``, collects the ``Link`` of every row
        whose ``LinkBody`` is NULL or empty into
        ``self.listOfEmptyLinkBodyLinks``, then downloads each page with
        ``RipPage.getPageSource``, cleans it with
        ``CleanText.cleanALLtheText`` and writes it back (together with
        ``DateBodyGenerated=GETDATE()``) with an UPDATE keyed on the link.
        """
        self.db = SUDBConnect()
        self.listOfEmptyLinkBodyLinks = []

        rowsWithEmptyLinkBody = self.db.getRowsDB("select * from dbo.GoogleLeads where ISNULL(LinkBody, '') = ''")
        # Iterating an empty result is a no-op, so no length guard is needed.
        for row in rowsWithEmptyLinkBody:
            self.listOfEmptyLinkBodyLinks.append(row.Link)

        for link in self.listOfEmptyLinkBodyLinks:
            linkbody = RipPage.getPageSource(link)
            linkbody = CleanText.cleanALLtheText(linkbody)
            # The statement is assembled by concatenation, so a single quote in
            # the link would end the string literal early. Doubling it is the
            # SQL escape for a quote inside a literal.
            escapedLink = link.replace("'", "''")
            # NOTE(review): linkbody is concatenated unchanged - this assumes
            # cleanALLtheText already makes the text safe for a SQL literal;
            # confirm. If SUDBConnect offers parameterized queries, prefer
            # those over string building.
            self.db.insertUpdateOrDeleteDB(
                "update dbo.GoogleLeads set LinkBody='" + linkbody + "', DateBodyGenerated=GETDATE() where Link='" + escapedLink + "'")

示例#13

0

显示文件

文件： FastWebLeads.py 项目： kyajpauley/cerebro

    def getInfoFromScholarshipPage(self, url):
        """Open a scholarship page and read its optional detail fields.

        Args:
            url: Page address loaded with ``self.driver.get``.

        Returns:
            list: ``[description, awardType, numAwards, majors,
            additionalInfo, sourceWebsite, sourceText]``. A field whose
            element ``checkIfElementExists`` does not find is an empty string.
        """
        self.driver.get(url)
        self.driver.implicitly_wait(2)

        def rawTextAt(xpath):
            # textContent of the element at xpath (no existence check).
            return self.driver.find_element_by_xpath(xpath).get_attribute('textContent')

        def cleanedTextOrBlank(xpath):
            # '' when the element is absent, otherwise its cleaned textContent.
            if self.checkIfElementExists(xpath):
                return CleanText.cleanALLtheText(rawTextAt(xpath))
            return ''

        description = cleanedTextOrBlank("//div[@class='description']")
        awardType = cleanedTextOrBlank("//p[text() = 'Award Type: ']/following-sibling::p[@class='data']")
        numAwards = cleanedTextOrBlank("//p[text() = 'Awards Available: ']/following-sibling::p[@class='data']")

        # The majors text has 'All Fields of Study' removed before it is cleaned.
        majors = ''
        if self.checkIfElementExists("//p[text() = 'Fields of Study: ']/following-sibling::p[@class='data major']"):
            rawMajors = rawTextAt("//p[text() = 'Fields of Study: ']/following-sibling::p[@class='data major']")
            majors = CleanText.cleanALLtheText(re.sub('All Fields of Study', '', rawMajors))

        additionalInfo = cleanedTextOrBlank(
            "//p[text() = 'Additional Info: ']/following-sibling::p[@class='data major']")

        # Website link: href kept as-is, plus the cleaned text of the linked page.
        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//p[text() = 'Website: ']/following-sibling::p[@class='data']/a"):
            websiteAnchor = self.driver.find_element_by_xpath(
                "//p[text() = 'Website: ']/following-sibling::p[@class='data']/a")
            sourceWebsite = websiteAnchor.get_attribute('href')
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        return [description, awardType, numAwards, majors, additionalInfo, sourceWebsite, sourceText]

示例#14

0

显示文件

文件： CheggLeads.py 项目： kyajmiller/Cerebro

    def getInfoFromScholarshipPage(self, link):
        """Open a scholarship page and read its detail fields.

        Args:
            link: Page address loaded with ``self.driver.get``.

        Returns:
            list or None: ``None`` when the page contains a button with the
            'ApplyNowButton' class combination searched for below. Otherwise
            ``[eligibility, applicationOverview, description, sponsor,
            sourceText, sourceWebsite]``, where the first five entries are
            cleaned and ``sourceWebsite`` is the raw ``url`` attribute of the
            'go-apply' button (or '' when that button is absent).
        """
        self.driver.get(link)
        self.driver.implicitly_wait(2)

        findBadButton = self.driver.find_elements_by_xpath(
            "//button[@class='btn-primary-sm save-profile chgsec_hostedsc-apply-ApplyNowButton chgser_sc']")
        # Pages that carry this button are not scraped.
        if findBadButton != []:
            return None

        # eligibility, applicationOverview, description, sponsor - in that order.
        textFieldXpaths = (
            "//span[@class='txt-3']",
            "//h3[text() = 'Application Overview']/following-sibling::div[@class='txt-3']",
            "//h3[text() = 'Purpose']/following-sibling::div[@class='txt-3']",
            "//h3[text() = 'Provider Organization']/following-sibling::div[@class='txt-3'][1]",
        )
        rawFields = []
        for textFieldXpath in textFieldXpaths:
            if self.checkIfElementExists(textFieldXpath):
                rawFields.append(
                    self.driver.find_element_by_xpath(textFieldXpath).get_attribute('textContent'))
            else:
                rawFields.append('')

        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//button[@class='btn-primary-sm go-apply']"):
            applyButton = self.driver.find_element_by_xpath("//button[@class='btn-primary-sm go-apply']")
            sourceWebsite = applyButton.get_attribute('url')
            sourceText = RipPage.getPageSource(sourceWebsite)
        rawFields.append(sourceText)

        # Clean every text field in one go; the website itself stays uncleaned
        # and is appended last.
        resultPageInfoArray = list(map(CleanText.cleanALLtheText, rawFields))
        resultPageInfoArray.append(sourceWebsite)
        return resultPageInfoArray

示例#15

0

显示文件

文件： Scholarships360Leads.py 项目： kyajpauley/cerebro

    def getInfoFromScholarshipPage(self, url):
        """Open a scholarship page and read its optional detail fields.

        Args:
            url: Page address loaded with ``self.driver.get``.

        Returns:
            list: ``[description, eligibility, amountInfo, deadlineInfo,
            sourceWebsite, sourceText]``. A field whose element
            ``checkIfElementExists`` does not find is an empty string.
        """
        self.driver.get(url)
        self.driver.implicitly_wait(2)

        # description, eligibility, amountInfo, deadlineInfo - in that order.
        textFieldXpaths = (
            "//div[@class='entry-content']/p[1]",
            "//div[@class='entry-content']/p/strong[text() = 'Who is eligible to apply?']/../following-sibling::ul[1]",
            "//div[@class='entry-content']/p/strong[text() = 'How much is each scholarship worth?']/../following-sibling::p[1]",
            "//div[@class='entry-content']/p/strong[text() = 'When is the deadline to apply?']/../following-sibling::ul[1]",
        )

        scholarshipPageInfoArray = []
        for textFieldXpath in textFieldXpaths:
            fieldText = ''
            if self.checkIfElementExists(textFieldXpath):
                fieldElement = self.driver.find_element_by_xpath(textFieldXpath)
                fieldText = CleanText.cleanALLtheText(fieldElement.get_attribute('textContent'))
            scholarshipPageInfoArray.append(fieldText)

        # Apply link: href kept as-is, plus the cleaned text of the linked page.
        sourceWebsite = ''
        sourceText = ''
        if self.checkIfElementExists("//span[@class='apply']/a"):
            sourceWebsite = self.driver.find_element_by_xpath("//span[@class='apply']/a").get_attribute('href')
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        scholarshipPageInfoArray.extend([sourceWebsite, sourceText])
        return scholarshipPageInfoArray

示例#16

0

显示文件

文件： FatomeiLeads.py 项目： kyajmiller/Cerebro

    def getLeads(self):
        """Scrape the scholarship rows of the current page into ``self.fatomeiLeadsArray``.

        Titles and links come from the anchors in the row preceding each
        ``td.f`` row; due dates and descriptions come from the cells of the
        ``td.f`` rows. For every title the linked page is fetched and cleaned.
        Each appended entry is ``[title, description, dueDate, link,
        sourceText]``. Nothing is returned.
        """
        titleAnchors = self.driver.find_elements_by_xpath(
            "//td[@class='f']/../preceding-sibling::tr[1]/td[@class='a']/a")
        dateDescriptionCells = self.driver.find_elements_by_xpath("//tr/td[@class='f']/../td")

        titlesList = self.getTitlesList(titleAnchors)
        linksList = self.getLinksList(titleAnchors)
        dueDatesList = self.getDueDates(dateDescriptionCells)
        descriptionsList = self.getDescriptionsList(dateDescriptionCells)

        # The sibling lists are indexed by the position of the title.
        for position, rawTitle in enumerate(titlesList):
            title = CleanText.cleanALLtheText(rawTitle)
            link = linksList[position]
            dueDate = dueDatesList[position]
            description = CleanText.cleanALLtheText(descriptionsList[position])
            rawSourceText = RipPage.getPageSource(link)
            sourceText = CleanText.cleanALLtheText(rawSourceText)

            self.fatomeiLeadsArray.append([title, description, dueDate, link, sourceText])

示例#17

0

显示文件

文件： TrafficSafetyStoreLeads.py 项目： kyajpauley/cerebro

    def getSourceWebsitesAndSourceTexts(self):
        """Gather the outbound links on the page and the cleaned text of each target.

        Returns:
            tuple: ``(sourceWebsitesList, sourceTextsList)``: the ``href`` of
            every matched anchor, and the cleaned page source fetched for each
            of them, in matching order.
        """
        anchorXpath = "//div[@class='col-xs-8 col-xs-offset-2']/a"

        sourceWebsitesList = []
        for sourceWebsiteDiv in self.driver.find_elements_by_xpath(anchorXpath):
            sourceWebsitesList.append(sourceWebsiteDiv.get_attribute('href'))

        # All pages are downloaded first, then all of them are cleaned.
        fetchedTexts = list(map(RipPage.getPageSource, sourceWebsitesList))
        sourceTextsList = list(map(CleanText.cleanALLtheText, fetchedTexts))

        return sourceWebsitesList, sourceTextsList

示例#18

0

显示文件

    def getLeads(self):
        """Collect one lead per scholarship title on the page into ``self.fatomeiLeadsArray``.

        The title anchors supply titles and links; the cells of the ``td.f``
        rows supply due dates and descriptions. Each link is downloaded with
        ``RipPage.getPageSource`` and cleaned. Every appended entry has the
        shape ``[title, description, dueDate, link, sourceText]``. Returns
        nothing.
        """
        titleLinkXpath = "//td[@class='f']/../preceding-sibling::tr[1]/td[@class='a']/a"
        dateDescriptionXpath = "//tr/td[@class='f']/../td"

        arrayOfTitleLinkDivs = self.driver.find_elements_by_xpath(titleLinkXpath)
        arrayOfDateDescriptionDivs = self.driver.find_elements_by_xpath(dateDescriptionXpath)

        titlesList = self.getTitlesList(arrayOfTitleLinkDivs)
        linksList = self.getLinksList(arrayOfTitleLinkDivs)
        dueDatesList = self.getDueDates(arrayOfDateDescriptionDivs)
        descriptionsList = self.getDescriptionsList(arrayOfDateDescriptionDivs)

        rowIndex = 0
        while rowIndex < len(titlesList):
            title = CleanText.cleanALLtheText(titlesList[rowIndex])
            link = linksList[rowIndex]
            dueDate = dueDatesList[rowIndex]
            description = CleanText.cleanALLtheText(descriptionsList[rowIndex])
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(link))

            self.fatomeiLeadsArray.append([title, description, dueDate, link, sourceText])
            rowIndex += 1

示例#19

0

显示文件

文件： PivotLeads.py 项目： kyajmiller/Cerebro

    def makeLeadArrayAndAddToGrantForwardLeads(self, singleResultArray):
        """Create the lead array for one search result and append it to ``self.arrayOfPivotLeads``.

        Args:
            singleResultArray: Indexable with the result name at position 0
                and the result page URL at position 1.

        The result page is read via ``goToResultPageAndPullInformation``; the
        stored list is ``[keyword, url, name, abstract, sponsor, amount,
        applicantType, citizenshipResidency, activityLocation, eligibility,
        categories, sourceWebsite, sourceText]``. Returns nothing.
        """
        name = CleanText.cleanALLtheText(singleResultArray[0])
        url = singleResultArray[1]
        resultPageInfo = self.goToResultPageAndPullInformation(url)

        keyword = CleanText.cleanALLtheText(self.searchTerm)

        # resultPageInfo positions in output order: abstract=6, sponsor=1,
        # amount=2, applicantType=3, citizenshipResidency=4,
        # activityLocation=5, eligibility=7, categories=8.
        cleanedFields = [CleanText.cleanALLtheText(resultPageInfo[position])
                         for position in (6, 1, 2, 3, 4, 5, 7, 8)]

        # Position 0 is the source website; it is stored uncleaned.
        sourceWebsite = resultPageInfo[0]
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        singleLeadArray = [keyword, url, name] + cleanedFields + [sourceWebsite, sourceText]
        self.arrayOfPivotLeads.append(singleLeadArray)

示例#20

0

显示文件

文件： CollegeGreenLightLeads.py 项目： kyajpauley/cerebro

    def getLeads(self):
        """Scrape every scholarship row of the current listing page into lead arrays.

        Titles, links, amounts and deadlines are read from the listing table.
        Each result page is then visited through
        ``goToResultPageAndPullInformation`` to obtain the sponsor (index 0),
        source website (index 1), description (index 2) and requirements
        (index 3). When the source website starts with ``http://`` or
        ``https://`` its page source is fetched and cleaned as well.

        The browser is shut down with ``self.driver.quit()`` on the way out,
        including when a scraping step raises, so a failure part-way through
        no longer leaves the browser session running. The exception itself
        still propagates to the caller.

        Returns:
            list: ``self.collegeGreenLightLeadsArrays``; each appended entry is
            ``[title, amount, deadline, sponsor, description, requirements,
            resultPageLink, sourceWebsite, sourceText]``.
        """
        try:
            arrayOfTitleLinkDivs = self.driver.find_elements_by_xpath(
                "//td[@class='scholarshipNameColumn']/div/a")
            arrayOfAmountDivs = self.driver.find_elements_by_xpath(
                "//td[@class='amount']")
            arrayOfDeadlineDivs = self.driver.find_elements_by_xpath(
                "//td[@class='deadline']")

            titlesList = self.getTitlesList(arrayOfTitleLinkDivs)
            linksList = self.getLinksList(arrayOfTitleLinkDivs)
            amountsList = self.getAmountsList(arrayOfAmountDivs)
            deadlinesList = self.getDeadlinesList(arrayOfDeadlineDivs)

            # Index-based access into the sibling lists is kept on purpose: a
            # shorter sibling list still raises IndexError instead of being
            # silently truncated.
            for i, rawTitle in enumerate(titlesList):
                title = CleanText.cleanALLtheText(rawTitle)
                resultPageLink = linksList[i]
                amount = CleanText.cleanALLtheText(amountsList[i])
                deadline = deadlinesList[i]

                resultPageInfo = self.goToResultPageAndPullInformation(
                    resultPageLink)
                sponsor = CleanText.cleanALLtheText(resultPageInfo[0])
                sourceWebsite = resultPageInfo[1]
                description = CleanText.cleanALLtheText(resultPageInfo[2])
                requirements = CleanText.cleanALLtheText(resultPageInfo[3])

                # Only absolute http(s) URLs are fetched.
                sourceText = ''
                if re.search('^https?://', sourceWebsite):
                    sourceText = CleanText.cleanALLtheText(
                        RipPage.getPageSource(sourceWebsite))

                leadArray = [
                    title, amount, deadline, sponsor, description, requirements,
                    resultPageLink, sourceWebsite, sourceText
                ]
                self.collegeGreenLightLeadsArrays.append(leadArray)
        finally:
            # Always release the browser, even when scraping failed.
            self.driver.quit()

        return self.collegeGreenLightLeadsArrays

示例#21

0

显示文件

文件： IefaLeads.py 项目： kyajmiller/Cerebro

    def goToResultsPageAndGetInfo(self, resultPageLink):
        """Open an award's result page and scrape its detail fields.

        Every field is located by XPath; a field whose element is not found
        is left as an empty string.

        Returns a list holding, in order: sponsor, submission deadline,
        majors, award amount, description, other criteria, number of awards,
        host institution, award includes, nationality required, host
        countries and source text (all passed through
        CleanText.cleanALLtheText), followed by the source website URL as the
        last element.
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)
        self.makeSureLoggedIn(resultPageLink)

        def textAt(xpath):
            # Probe for the element first; absent elements yield ''.
            if not self.checkIfElementExists(xpath):
                return ''
            return self.driver.find_element_by_xpath(xpath).get_attribute('textContent')

        sponsor = re.sub('^Sponsor:', '', textAt("//span[@class='award-sponsor']"))
        submissionDeadline = textAt("//h4[text() = 'Submission Deadline']/following-sibling::p")
        majors = re.sub('Unrestricted', '', textAt("//p[@id='award-fieldofstudy']"))
        awardAmount = textAt("//p[@id='award-amount']")
        description = textAt(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Description']/following-sibling::p[1]")
        otherCriteria = textAt(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Other Criteria']/following-sibling::p[1]")
        numberAwards = textAt("//th[text() = 'Number of Awards']/following-sibling::td")
        hostInstitution = textAt("//th[text() = 'Host Institution']/following-sibling::td")
        awardIncludes = textAt("//th[text() = 'Includes']/following-sibling::td")
        nationalityRequired = re.sub(
            'Unrestricted', '', textAt("//th[text() = 'Nationality Required']/following-sibling::td"))
        hostCountries = re.sub(
            'Unrestricted', '', textAt("//th[text() = 'Host Countries']/following-sibling::td"))

        # The external link is handled separately: its page is fetched whenever
        # the anchor exists, whatever its href turns out to be.
        sourceWebsite = ''
        sourceText = ''
        linkXpath = "//th[text() = 'Link']/following-sibling::td/a"
        if self.checkIfElementExists(linkXpath):
            sourceWebsite = self.driver.find_element_by_xpath(linkXpath).get_attribute('href')
            sourceText = RipPage.getPageSource(sourceWebsite)

        textFields = [sponsor, submissionDeadline, majors, awardAmount, description, otherCriteria,
                      numberAwards, hostInstitution, awardIncludes, nationalityRequired, hostCountries,
                      sourceText]
        resultPageInfoArray = [CleanText.cleanALLtheText(field) for field in textFields]
        # The URL is appended after cleaning so it is returned unmodified.
        resultPageInfoArray.append(sourceWebsite)

        return resultPageInfoArray

示例#22

0

显示文件

文件： rippagetest.py 项目： kyajmiller/Cerebro

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs

from Classes.RipPage import RipPage
from Classes.CleanText import CleanText

# Manual check: fetch a sample award page, convert it via a UTF-8 bytearray,
# and print the converted text followed by its cleaned form.
# Alternate sample page:
# rippedpage = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CTP_004520')
ripped = RipPage.getPageSource(
    'http://webapps.acs.org/findawards/detail.jsp?ContentId=CNBP_031057')
arr = bytearray(ripped, "utf-8")
ripped2 = str(arr)
# The converted text is printed twice.
print(ripped2)
print(ripped2)

print(CleanText.cleanALLtheText(ripped2))

示例#23

0

显示文件

文件： testrippageurl.py 项目： kyajpauley/cerebro

from Classes.RipPage import RipPage

# Smoke check: fetch the page and discard the result.
RipPage.getPageSource('http://engineering.berkeley.edu/')

# Scratch string-concatenation check; prints "cheesecrackers".
cheese = 'cheese'
crackers = 'crackers'

cheesecrackers = cheese + crackers
print(cheesecrackers)

示例#24

0

显示文件

文件： GrantForwardLeads.py 项目： kyajpauley/cerebro

    def goToResultPageAndPullInformation(self, resultPageLink):
        """Open a grant's result page and scrape its detail fields.

        Each field is located by XPath and passed through
        CleanText.cleanALLtheText; a field whose element is not found is
        returned as an empty string. When a source link is present, its page
        is fetched and cleaned as well.

        Returns [description, sponsor, amount, eligibility, submissionInfo,
        categories, sourceWebsite, sourceText, deadline].
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)

        def cleanedText(xpath):
            # Probe for the element first; absent elements yield ''.
            if not self.checkIfElementExists(xpath):
                return ''
            rawText = self.driver.find_element_by_xpath(xpath).get_attribute(
                'textContent')
            return CleanText.cleanALLtheText(rawText)

        description = cleanedText(
            "//div[@id = 'field-description']/div[@class = 'content-collapsed']")
        sponsor = cleanedText("//div[@class = 'sponsor-content']/div/a")
        amount = cleanedText(
            "//div[@id = 'field-amount_info']/div[@class = 'content-collapsed']")
        eligibility = cleanedText(
            "//div[@id = 'field-eligibility']/div[@class = 'content-collapsed']")
        submissionInfo = cleanedText(
            "//div[@id = 'field-submission_info']/div[@class = 'content-collapsed']")
        categories = cleanedText("//div[@id = 'field-subjects']/ul")

        # The source link yields two values: the raw href and the cleaned text
        # of the page it points to.
        sourceWebsite = ''
        sourceText = ''
        sourceLinkXpath = "//a[@class = 'source-link btn btn-warning']"
        if self.checkIfElementExists(sourceLinkXpath):
            sourceWebsite = self.driver.find_element_by_xpath(
                sourceLinkXpath).get_attribute('href')
            sourceText = CleanText.cleanALLtheText(
                RipPage.getPageSource(sourceWebsite))

        deadline = cleanedText(
            "//div[@class='table-responsive deadline-tables']/table/tbody")

        return [
            description, sponsor, amount, eligibility, submissionInfo,
            categories, sourceWebsite, sourceText, deadline
        ]

示例#25

0

显示文件

文件： GoodCallLeads.py 项目： kyajpauley/cerebro

    def getInfoFromResultPage(self, resultPageLink):
        """Open a scholarship result page and scrape its detail fields.

        Each field is located by XPath; a field whose element is not found is
        returned as an empty string. Restriction rows have the placeholder
        texts 'No Restrictions Listed' / 'No Restrictions' removed, and any
        parenthesised text is stripped from the deadline.

        :param resultPageLink: URL of the result page to load in the driver.
        :return: [description, sponsor, classStatus, major, gender, ethnicity,
            grades, testScores, geography, deadline, essayInfo, sourceWebsite,
            sourceText]
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)

        def scrape(xpath, dropPattern=None):
            # Cleaned textContent of the element at xpath, or '' if absent.
            # dropPattern, when given, is removed before cleaning.
            if not self.checkIfElementExists(xpath):
                return ''
            text = self.driver.find_element_by_xpath(xpath).get_attribute(
                'textContent')
            if dropPattern is not None:
                text = re.sub(dropPattern, '', text)
            return CleanText.cleanALLtheText(text)

        def restriction(label):
            # Value cell of the table row whose label cell contains `label`,
            # with the "no restriction" placeholders removed.
            value = scrape(
                "//tr/td[contains(text(), '%s')]/following-sibling::td" % label)
            value = re.sub('No Restrictions Listed', '', value)
            return re.sub('No Restrictions', '', value)

        description = scrape("//div[@id='main-column']/p[1]")
        sponsor = scrape("//div[@id='main-column']/p[2]", 'Sponsor:')

        classStatus = restriction('School Year')
        # NOTE(review): major is read from the same 'School Year' row as
        # classStatus, which looks like a copy-paste slip. The correct row
        # label is not visible here, so the lookup is left as-is -- confirm
        # against the page markup.
        major = restriction('School Year')
        gender = restriction('Gender')
        ethnicity = restriction('Ethnicity')
        grades = restriction('Grades')
        testScores = restriction('Test Scores')
        geography = restriction('Geography')

        # Strip any parenthesised text from the deadline. The dot must be
        # unescaped so that it matches arbitrary characters between the
        # parentheses.
        deadline = scrape("//span[@class='deadline data']", r'\(.*?\)')

        essayInfo = ''
        essayXpath = "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p"
        if self.checkIfElementExists(essayXpath):
            essayParagraphs = self.driver.find_elements_by_xpath(essayXpath)
            essayTexts = [
                CleanText.cleanALLtheText(paragraph.get_attribute('textContent'))
                for paragraph in essayParagraphs
            ]

            lengthXpath = "//div[@id='essay-length']"
            essayLengths = []
            if self.checkIfElementExists(lengthXpath):
                essayLengths = self.driver.find_elements_by_xpath(lengthXpath)

            # Length notes are only paired with the essay paragraphs when
            # there is exactly one note per paragraph.
            if essayLengths and len(essayLengths) == len(essayParagraphs):
                essayTexts = [
                    '%s %s' % (essayText, CleanText.cleanALLtheText(
                        lengthNote.get_attribute('textContent')))
                    for essayText, lengthNote in zip(essayTexts, essayLengths)
                ]
            essayInfo = ' '.join(essayTexts)

        sourceWebsite = ''
        sourceText = ''
        sourceXpath = "//a[@class='action-button visit-site']"
        if self.checkIfElementExists(sourceXpath):
            sourceWebsite = self.driver.find_element_by_xpath(
                sourceXpath).get_attribute('href')
            sourceText = CleanText.cleanALLtheText(
                RipPage.getPageSource(sourceWebsite))

        resultPageInfoArray = [
            description, sponsor, classStatus, major, gender, ethnicity,
            grades, testScores, geography, deadline, essayInfo, sourceWebsite,
            sourceText
        ]

        return resultPageInfoArray

示例#26

0

显示文件

文件： rippagetest.py 项目： kyajpauley/cerebro

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs

from Classes.RipPage import RipPage
from Classes.CleanText import CleanText

# Manual check: fetch a sample award page, convert it via a UTF-8 bytearray,
# and print the converted text followed by its cleaned form.
# Alternate sample page:
# rippedpage = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CTP_004520')
ripped = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CNBP_031057')
arr = bytearray(ripped, "utf-8")
ripped2 = str(arr)
# The converted text is printed twice.
print(ripped2)
print(ripped2)

print(CleanText.cleanALLtheText(ripped2))

示例#27

0

显示文件

    def getLeads(self):
        """Build one lead per scholarship heading on the loaded page.

        Headings titled 'Quick Links' or 'About Us' are skipped. The first
        heading takes its description, link and requirements from the intro
        block; later headings take them from the paragraphs following the
        heading, when those exist. The source website is then fetched, the
        text fields are cleaned, and the lead is stored. The driver window is
        closed before returning.

        Returns self.teacherDotOrgLeadArrays, with one list per lead:
        [title, description, requirements, sourceWebsite, sourceText].
        """
        headingXpath = "//h3[not(ancestor::div[@id='scholarship_intro_859'])]"
        exists = self.checkIfElementExists
        find = self.driver.find_element_by_xpath

        headings = self.driver.find_elements_by_xpath(headingXpath)
        for position, heading in enumerate(headings):
            title = heading.get_attribute('textContent')
            if title in ('Quick Links', 'About Us'):
                continue

            description = ''
            requirements = ''
            sourceWebsite = ''

            if position == 0:
                description = find(
                    "//div[@class='intro']/p").get_attribute('textContent')
                sourceWebsite = find(
                    "//div[@class='intro']/p/a").get_attribute('href')
                requirements = find(
                    "//div[@class='intro']/following-sibling::*[1][self::ul]"
                ).get_attribute('textContent')
            else:
                # XPath positions are 1-based, hence position + 1.
                nthHeading = "%s[%s]" % (headingXpath, position + 1)
                firstParagraph = nthHeading + "/following-sibling::p[1]"
                # Second paragraph, only when it directly follows another <p>.
                secondParagraph = (
                    nthHeading +
                    "/following-sibling::p[2][(preceding-sibling::*[1][self::p])]")
                firstLink = firstParagraph + "/a"
                secondLink = secondParagraph + "/a"

                if exists(firstParagraph):
                    description = find(firstParagraph).get_attribute(
                        'textContent')
                if exists(secondParagraph):
                    requirements = find(secondParagraph).get_attribute(
                        'textContent')

                # Prefer a link in the first paragraph; fall back to one in
                # the second paragraph.
                if exists(firstLink):
                    sourceWebsite = find(firstLink).get_attribute('href')
                elif exists(secondParagraph) and exists(secondLink):
                    sourceWebsite = find(secondLink).get_attribute('href')

            sourceText = RipPage.getPageSource(sourceWebsite)

            clean = CleanText.cleanALLtheText
            title = clean(title)
            description = clean(description)
            requirements = clean(requirements)
            sourceText = clean(sourceText)

            self.teacherDotOrgLeadArrays.append(
                [title, description, requirements, sourceWebsite, sourceText])

        self.driver.close()
        return self.teacherDotOrgLeadArrays

示例#28

0

显示文件

文件： TestRipPage.py 项目： kyajpauley/cerebro

 def test_RipPage(self):
     """Fetching a live page should give a non-None result of length > 10."""
     pageSource = RipPage.getPageSource('https://www.google.com/')
     self.assertIsNotNone(pageSource)
     minimumLength = 10
     self.assertGreater(len(pageSource), minimumLength)

示例#29

0

显示文件

文件： GoodCallLeads.py 项目： kyajmiller/Cerebro

    def getInfoFromResultPage(self, resultPageLink):
        """Open a scholarship result page and scrape its detail fields.

        Each field is located by XPath; a field whose element is not found is
        returned as an empty string. Restriction rows have the placeholder
        texts 'No Restrictions Listed' / 'No Restrictions' removed, and any
        parenthesised text is stripped from the deadline.

        :param resultPageLink: URL of the result page to load in the driver.
        :return: [description, sponsor, classStatus, major, gender, ethnicity,
            grades, testScores, geography, deadline, essayInfo, sourceWebsite,
            sourceText]
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)

        def scrape(xpath, dropPattern=None):
            # Cleaned textContent of the element at xpath, or '' if absent.
            # dropPattern, when given, is removed before cleaning.
            if not self.checkIfElementExists(xpath):
                return ''
            text = self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
            if dropPattern is not None:
                text = re.sub(dropPattern, '', text)
            return CleanText.cleanALLtheText(text)

        def restriction(label):
            # Value cell of the table row whose label cell contains `label`,
            # with the "no restriction" placeholders removed.
            value = scrape("//tr/td[contains(text(), '%s')]/following-sibling::td" % label)
            value = re.sub('No Restrictions Listed', '', value)
            return re.sub('No Restrictions', '', value)

        description = scrape("//div[@id='main-column']/p[1]")
        sponsor = scrape("//div[@id='main-column']/p[2]", 'Sponsor:')

        classStatus = restriction('School Year')
        # NOTE(review): major is read from the same 'School Year' row as
        # classStatus, which looks like a copy-paste slip. The correct row
        # label is not visible here, so the lookup is left as-is -- confirm
        # against the page markup.
        major = restriction('School Year')
        gender = restriction('Gender')
        ethnicity = restriction('Ethnicity')
        grades = restriction('Grades')
        testScores = restriction('Test Scores')
        geography = restriction('Geography')

        # Strip any parenthesised text from the deadline. The dot must be
        # unescaped so that it matches arbitrary characters between the
        # parentheses.
        deadline = scrape("//span[@class='deadline data']", r'\(.*?\)')

        essayInfo = ''
        essayXpath = "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p"
        if self.checkIfElementExists(essayXpath):
            essayParagraphs = self.driver.find_elements_by_xpath(essayXpath)
            essayTexts = [CleanText.cleanALLtheText(paragraph.get_attribute('textContent'))
                          for paragraph in essayParagraphs]

            lengthXpath = "//div[@id='essay-length']"
            essayLengths = []
            if self.checkIfElementExists(lengthXpath):
                essayLengths = self.driver.find_elements_by_xpath(lengthXpath)

            # Length notes are only paired with the essay paragraphs when
            # there is exactly one note per paragraph.
            if essayLengths and len(essayLengths) == len(essayParagraphs):
                essayTexts = ['%s %s' % (essayText,
                                         CleanText.cleanALLtheText(lengthNote.get_attribute('textContent')))
                              for essayText, lengthNote in zip(essayTexts, essayLengths)]
            essayInfo = ' '.join(essayTexts)

        sourceWebsite = ''
        sourceText = ''
        sourceXpath = "//a[@class='action-button visit-site']"
        if self.checkIfElementExists(sourceXpath):
            sourceWebsite = self.driver.find_element_by_xpath(sourceXpath).get_attribute('href')
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

        resultPageInfoArray = [description, sponsor, classStatus, major, gender, ethnicity, grades, testScores,
                               geography, deadline, essayInfo, sourceWebsite, sourceText]

        return resultPageInfoArray

示例#30

0

显示文件

文件： IefaLeads.py 项目： kyajpauley/cerebro

    def goToResultsPageAndGetInfo(self, resultPageLink):
        """Open an award's result page and scrape its detail fields.

        The text fields are described by a table of (XPath, pattern) pairs;
        when an element exists its textContent is read and the pattern, if
        any, is removed from it. A field whose element is not found is left
        as an empty string.

        Returns a list holding, in order: sponsor, submission deadline,
        majors, award amount, description, other criteria, number of awards,
        host institution, award includes, nationality required, host
        countries and source text (all passed through
        CleanText.cleanALLtheText), followed by the source website URL as the
        last element.
        """
        self.driver.get(resultPageLink)
        self.driver.implicitly_wait(2)
        self.makeSureLoggedIn(resultPageLink)

        # (xpath, regex to remove from the text or None), in output order.
        textFieldSpecs = (
            ("//span[@class='award-sponsor']", '^Sponsor:'),
            ("//h4[text() = 'Submission Deadline']/following-sibling::p", None),
            ("//p[@id='award-fieldofstudy']", 'Unrestricted'),
            ("//p[@id='award-amount']", None),
            ("//div[@class='award-description padding_bottom_30']/h4[text() = 'Description']/following-sibling::p[1]",
             None),
            ("//div[@class='award-description padding_bottom_30']/h4[text() = 'Other Criteria']/following-sibling::p[1]",
             None),
            ("//th[text() = 'Number of Awards']/following-sibling::td", None),
            ("//th[text() = 'Host Institution']/following-sibling::td", None),
            ("//th[text() = 'Includes']/following-sibling::td", None),
            ("//th[text() = 'Nationality Required']/following-sibling::td",
             'Unrestricted'),
            ("//th[text() = 'Host Countries']/following-sibling::td",
             'Unrestricted'),
        )

        scrapedFields = []
        for xpath, removePattern in textFieldSpecs:
            value = ''
            if self.checkIfElementExists(xpath):
                value = self.driver.find_element_by_xpath(xpath).get_attribute(
                    'textContent')
                if removePattern is not None:
                    value = re.sub(removePattern, '', value)
            scrapedFields.append(value)

        # The external link yields both the raw href and the fetched page.
        sourceWebsite = ''
        sourceText = ''
        linkXpath = "//th[text() = 'Link']/following-sibling::td/a"
        if self.checkIfElementExists(linkXpath):
            sourceWebsite = self.driver.find_element_by_xpath(
                linkXpath).get_attribute('href')
            sourceText = RipPage.getPageSource(sourceWebsite)
        scrapedFields.append(sourceText)

        resultPageInfoArray = [
            CleanText.cleanALLtheText(field) for field in scrapedFields
        ]
        # The URL is appended after cleaning so it is returned unmodified.
        resultPageInfoArray.append(sourceWebsite)

        return resultPageInfoArray