Python modelInit示例，scraperModelGS.modelInit Python示例

示例#1

0

显示文件

    ## Copy all elements from sheet row read
    for elm in row:
        record[recordKeys[key]] = elm
        key += 1

    ## Fill in remainder with empty strings
    while key < len(recordKeys):
        record[recordKeys[key]] = ''
        key += 1

    return record


if __name__ == '__main__':
    # Script entry point: pull headers and rows from Google Sheets and load
    # the contact rows into a pandas DataFrame.
    # NOTE(review): this excerpt may be cut off after the DataFrame step.

    # Initialise Google Sheets access. The value returned by modelInit() is
    # kept in the module-level name `get_credentials`; the getter functions
    # below are called without arguments, so they presumably read it from
    # module scope -- TODO confirm before renaming or moving this line.
    get_credentials = smgs.modelInit()

    # Fetch the column headers of both sheets.
    print('KEYS')
    contactKeys = getContactKeys()
    directoryKeys = getAgencyDirKeys()
    print('')

    # Fetch the contact rows and the organization website rows, turning each
    # row into a record keyed by the headers collected above.
    print('RECORDS')
    contactRecords = [sheetRecord(row, contactKeys) for row in getContacts()]
    orgRecords = [sheetRecord(row, directoryKeys) for row in getAgencyDir()]
    print('')

    # Build a DataFrame from the contact records.
    cr = pd.DataFrame(contactRecords)

示例#2

0

显示文件

文件： directoryManager.py 项目： BARarch/contacts-scraper

class DirectoryManager(object):
    """Manage access to the organization directory for a scrape session.

    The class is not the directory itself: it wraps a list of organization
    records (dicts read with keys such as 'Organization', 'Directory link',
    'Link 2', ...) and writes results back to the 'Org Leadership Websites'
    sheet through the Google Sheets API.

    Status notifications are published as small dicts on an optional,
    class-wide queue (see ``set_app_scraper_queue``). While no queue is set
    the notification methods do nothing.
    """

    # Callable returned by smgs.modelInit(). It is always invoked through the
    # class (DirectoryManager.get_credentials()) so it is not bound to an
    # instance.
    get_credentials = smgs.modelInit()
    # Queue shared by every instance for status messages; None disables them.
    scraperQueue = None

    # Sheets API settings shared by both write methods.
    _SPREADSHEET_ID = '1p1LNyQhNhDBNEOkYQPV9xcNRe60WDlmnuiPp78hxkIs'
    _SHEET_NAME = 'Org Leadership Websites'
    _DISCOVERY_URL = ('https://sheets.googleapis.com/$discovery/rest?'
                      'version=v4')
    _VALUE_INPUT_OPTION = 'RAW'

    # Record keys holding an organization's links, in priority order.
    _LINK_KEYS = ('Directory link', 'Link 2', 'Link 3', 'Link 4')

    def __init__(self, orgRecords):
        """Store the list of organization records to manage."""
        self.orgRecords = orgRecords

    def findOrgRecord(self, organization):
        """Return the first record whose 'Organization' equals *organization*.

        Returns None when no record matches.
        """
        for org in self.orgRecords:
            if organization == org['Organization']:
                return org
        return None

    def get_organizations(self):
        """Return the 'Organization' value of every record, in order."""
        return [x['Organization'] for x in self.orgRecords]

    def orgRecordIndex(self, orgRecord):
        """Return the position of *orgRecord* in the record list.

        Raises ValueError (from list.index) when the record is not present.
        """
        return self.orgRecords.index(orgRecord)

    def linkList(self, orgRecord):
        """Return the record's links up to, not including, the first ''.

        When every link column is filled there is no '' terminator; all four
        links are returned in that case instead of raising ValueError.
        """
        links = [orgRecord[key] for key in DirectoryManager._LINK_KEYS]
        try:
            return links[:links.index('')]
        except ValueError:
            # No empty entry: every link column is populated.
            return links

    def writeRecordRow(self, row, index):
        """Write *row* to the sheet starting at column F of the record's row.

        *row* is sent unchanged as the request's 'values', so it must be a
        list of rows; row[0][1] and row[0][2] are joined into the time string
        attached to the '__recordRow' notification. *index* is the record's
        zero-based position; the sheet row used is ``index + 2``.

        Returns the result of the executed Sheets API update request.
        """
        # Best effort only: a malformed row must not prevent the write.
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        try:
            timeString = '{} {}'.format(str(row[0][1]), row[0][2])
            print(timeString)
            DirectoryManager.dir_on_with_time('__recordRow', timeString)
        except Exception as e:
            print(e)

        try:
            return DirectoryManager._update_range('F', index, row)
        finally:
            # Always clear the indicator, even when the API call fails.
            DirectoryManager.dir_off('__recordRow')

    def writeRecordNote(self, note, index):
        """Write *note* into column J of the record's sheet row.

        *index* is the record's zero-based position; the sheet row used is
        ``index + 2``. Returns the result of the executed update request.
        """
        DirectoryManager.dir_on('__recordNote')
        try:
            return DirectoryManager._update_range('J', index, [[note]])
        finally:
            # Always clear the indicator, even when the API call fails.
            DirectoryManager.dir_off('__recordNote')

    @staticmethod
    def _build_service():
        """Build an authorized Sheets v4 service object."""
        credentials = DirectoryManager.get_credentials()
        http = credentials.authorize(smgs.httplib2.Http())
        return smgs.discovery.build(
            'sheets',
            'v4',
            http=http,
            discoveryServiceUrl=DirectoryManager._DISCOVERY_URL)

    @staticmethod
    def _update_range(column, index, values):
        """Send *values* to ``<sheet>!<column><index + 2>`` and return the result."""
        rangeName = DirectoryManager._SHEET_NAME + '!' + column + str(index + 2)
        service = DirectoryManager._build_service()
        return service.spreadsheets().values().update(
            spreadsheetId=DirectoryManager._SPREADSHEET_ID,
            range=rangeName,
            valueInputOption=DirectoryManager._VALUE_INPUT_OPTION,
            body={'values': values}).execute()

    @staticmethod
    def _notify(message):
        """Put *message* on the shared queue when one has been set."""
        if DirectoryManager.scraperQueue:
            DirectoryManager.scraperQueue.put(message)

    @classmethod
    def set_app_scraper_queue(cls, q):
        """Set the queue that receives status messages for all instances."""
        DirectoryManager.scraperQueue = q

    @classmethod
    def dir_on(cls, place):
        """Announce that directory work started at *place*."""
        DirectoryManager._notify({'__DIRON': place})

    @classmethod
    def dir_on_with_time(cls, place, timeString):
        """Announce that directory work started at *place*, with a time string."""
        DirectoryManager._notify({
            '__DIRON': place,
            'time': timeString
        })

    @classmethod
    def dir_off(cls, place):
        """Announce that directory work finished at *place*."""
        DirectoryManager._notify({'__DIROFF': place})

    @classmethod
    def change_on(cls, place):
        """Publish a '__NEWBROWSERON' message for *place*."""
        DirectoryManager._notify({'__NEWBROWSERON': place})

    @classmethod
    def change_off(cls, place):
        """Publish a '__NEWBROWSEROFF' message for *place*."""
        DirectoryManager._notify({'__NEWBROWSEROFF': place})

示例#3

0

显示文件

    def run(self):
        """Load sheet data, prepare the contact checker, then serve commands.

        Startup progress is reported on ``self.startupQueue`` as dicts with
        'progress' (1 through 7, then 'FINNISHED') and/or 'message' keys.
        Once startup completes, ``self.listen_for_cmd()`` is called repeatedly
        until it returns a falsy value. The browser held by
        ``cc.VerificationHandler`` is closed when the loop ends, including
        when the loop is left because of an exception.
        """
        get_credentials_method = smgs.modelInit()

        # Get headers from Google Sheets
        print('KEYS')
        self.startupQueue.put({'progress': 1})
        self.startupQueue.put({'message': 'KEYS'})
        contactKeys = getContactKeys(get_credentials_method)
        self.startupQueue.put({'progress': 2})
        directoryKeys = getAgencyDirKeys(get_credentials_method)
        self.startupQueue.put({'progress': 3})
        print('')

        # Get contact and organization website data, structured with the
        # collected headings
        print('RECORDS')
        self.startupQueue.put({'message': 'RECORDS'})
        contactRecords = [
            sheetRecord(row, contactKeys)
            for row in getContacts(get_credentials_method)
        ]
        self.startupQueue.put({'progress': 4})
        self.orgRecords = [
            sheetRecord(row, directoryKeys)
            for row in getAgencyDir(get_credentials_method)
        ]
        self.startupQueue.put({'progress': 5})
        print('')

        # Create the contacts DataFrame (the organization records are used
        # as a plain list below, so no DataFrame is built for them)
        cr = pd.DataFrame(contactRecords)
        print('DATAFRAMES READY')
        self.startupQueue.put({'message': 'DATAFRAMES READY', 'progress': 6})

        ## //////////////////  Initialize Contact Checker Classes with Fresh Data  \\\\\\\\\\\\\\\\\\\

        # Set up contact record output
        cc.ContactSheetOutput.set_output(contactKeys)
        self.startupQueue.put({'progress': 7})

        # IMPORTANT STEP: the browser path must be set to the current working
        # directory, which varies between machines
        dm.OrgSession.set_browser_path()
        # For this scrape session give the verification handler an org
        # session built from the organization records
        cc.VerificationHandler.set_orgRecords(
            dm.HeadlessOrgSession(self.orgRecords))
        # ... and the contact record data
        cc.VerificationHandler.set_contactRecords(cr)

        cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
        cc.ScrapeSession.set_app_command_queue(self.commandQueue)
        print('CONTACT CHECKER READY')

        print('SCRAPE SESSION OPEN')
        print('')
        self.startupQueue.put({
            'message': 'SCRAPE SESSION OPEN',
            'progress': 'FINNISHED'
        })

        ## //////////////////        Begin Scraper Loop         \\\\\\\\\\\\\\\\\\\
        self.commandLoop = True
        try:
            while self.commandLoop:
                self.commandLoop = self.listen_for_cmd()
        finally:
            # Release the browser even if a command handler raised, so the
            # browser is not left open after the thread dies.
            cc.VerificationHandler.close_browser()
        print('SCRAPER THREAD FINNISHED')

示例#4

0

显示文件

文件： scraperThread.py 项目： BARarch/contacts-scraper

    def start_scraper(self):
        """Load sheet data and wire up the contact-checker classes.

        Each startup stage is bracketed on ``self.startupQueue`` by a
        '__waiting' message (stage about to start) and a '__ready' message
        (stage done), identified by the ``ScraperThread.*Val`` constants,
        together with 'progress' and 'message' entries. The final message
        carries progress 'FINNISHED'.
        """
        credentials_getter = smgs.modelInit()

        # ---- Sheet headers ----
        print('KEYS')
        notify = self.startupQueue.put
        notify({'progress': 'START'})
        notify({
            'message': 'KEYS',
            '__waiting': ScraperThread.ContactKeysVal,
        })
        contact_headers = getContactKeys(credentials_getter)
        notify({
            'progress': 1,
            '__ready': ScraperThread.ContactKeysVal,
            '__waiting': ScraperThread.DirectoryKeysVal,
        })
        directory_headers = getAgencyDirKeys(credentials_getter)
        notify({
            'progress': 2,
            '__ready': ScraperThread.DirectoryKeysVal,
        })
        print('')

        # ---- Sheet rows, keyed by the headers collected above ----
        print('RECORDS')
        notify({
            'message': 'RECORDS',
            '__waiting': ScraperThread.ContactRecordsVal,
        })
        contact_rows = getContacts(credentials_getter)
        contact_records = [sheetRecord(r, contact_headers) for r in contact_rows]
        notify({
            'progress': 3,
            '__ready': ScraperThread.ContactRecordsVal,
            '__waiting': ScraperThread.AgencyDirectoryVal,
        })
        directory_rows = getAgencyDir(credentials_getter)
        self.orgRecords = [sheetRecord(r, directory_headers) for r in directory_rows]
        notify({
            'progress': 4,
            '__ready': ScraperThread.AgencyDirectoryVal,
        })
        print('')

        # ---- DataFrames ----
        notify({'__waiting': ScraperThread.DataVal})
        contacts_df = pd.DataFrame(contact_records)
        orgs_df = pd.DataFrame(self.orgRecords)
        print('DATAFRAMES READY')
        notify({
            'message': 'DATAFRAMES READY',
            'progress': 5,
            '__ready': ScraperThread.DataVal,
        })

        ## //////////////////  Initialize Contact Checker Classes with Fresh Data  \\\\\\\\\\\\\\\\\\\

        # ---- Contact record output ----
        notify({'__waiting': ScraperThread.OutputVal})
        cc.ContactSheetOutput.set_output(contact_headers)
        notify({
            'progress': 6,
            '__ready': ScraperThread.OutputVal,
            '__waiting': ScraperThread.BrowserDriverVal,
        })

        # ---- Browser-backed organization session ----
        # IMPORTANT STEP: the browser path must be set to the current working
        # directory, which varies between machines.
        dm.OrgSession.set_browser_path()
        org_session = dm.HeadlessOrgSession(self.orgRecords)
        cc.VerificationHandler.set_orgRecords(org_session)
        notify({
            'progress': 7,
            '__ready': ScraperThread.BrowserDriverVal,
            '__waiting': ScraperThread.ContactCheckerVal,
        })

        # ---- Contact data and application queues ----
        cc.VerificationHandler.set_contactRecords(contacts_df)
        cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
        cc.ContactSheetOutput.set_app_scraper_queue(self.scraperQueue)
        cc.ContactCollector.set_app_scraper_queue(self.scraperQueue)
        dm.DirectoryManager.set_app_scraper_queue(self.scraperQueue)
        dm.OrgQuery.set_app_scraper_queue(self.scraperQueue)
        cc.ScrapeSession.set_app_command_queue(self.commandQueue)

        # ---- Row counts, then announce the open session ----
        contact_count = cc.ContactSheetOutput.count_contacts_rows()
        output_count = cc.ContactSheetOutput.count_scraper_output_rows()
        notify({
            'rowCounts': {
                'contact counts': contact_count,
                'output counts': output_count,
            }
        })
        print('CONTACT CHECKER READY')
        print('SCRAPE SESSION OPEN')
        print('')
        notify({
            'message': 'SCRAPE SESSION OPEN',
            'progress': 'FINNISHED',
            '__ready': ScraperThread.ContactCheckerVal,
        })