def __init__(self, strConfigAppName, strConfigSect):
    """Read the configuration, open the configured browser and sign in.

    Args:
        strConfigAppName: Application name handed to ``XmlConfigReader.Config``.
        strConfigSect: Configuration section handed to ``XmlConfigReader.Config``.

    NOTE(review): only the value "Chrome" creates ``self._driver``; for any
    other "Browser" setting the attribute is left unset before
    ``SignIntoRF()`` runs - confirm that this is intended.
    """
    # Original note: "load redfind website".
    config = XmlConfigReader.Config(strConfigAppName, strConfigSect)
    self._cfg = config

    browser_name = config.getConfigValue("Browser")
    print("Opening {0} browser".format(browser_name))

    wants_chrome = browser_name == "Chrome"
    if wants_chrome:
        self._driver = webdriver.Chrome()

    self.SignIntoRF()
def __init__(self, strConfigFilePath, strConfigSect):
    """Initialise the Matrix scraper: load its configuration and open the browser.

    Args:
        strConfigFilePath: First argument handed to ``XmlConfigReader.Config``.
        strConfigSect: Configuration section handed to ``XmlConfigReader.Config``.

    No sign-in happens here; the ``SignIntoMatrix()`` call is disabled in the
    original code.

    NOTE(review): only the value "Chrome" creates ``self._driver``; any other
    "Browser" setting leaves the attribute unset - confirm that this is
    intended.
    """
    print("Initialize Matrix scrapper")
    config = XmlConfigReader.Config(strConfigFilePath, strConfigSect)
    self._cfg = config

    # The gecko driver path is looked up but not used by this constructor;
    # the lookup is kept so a missing setting still surfaces here.
    config.getConfigValue("GeckoPath")

    browser_name = config.getConfigValue("Browser")
    print("Opening {0} browser".format(browser_name))

    wants_chrome = browser_name == "Chrome"
    if wants_chrome:
        self._driver = webdriver.Chrome()
def __init__(self, strConfigFilePath, strConfigSect):
    """Open Firefox, log in with the HAR credentials and switch to the Matrix window.

    Args:
        strConfigFilePath: First argument handed to ``XmlConfigReader.Config``.
        strConfigSect: Configuration section handed to ``XmlConfigReader.Config``.

    After the constructor returns, ``self._driver`` is attached to the second
    browser window (the one opened by the Matrix entry link); the window used
    for the login has been closed.
    """
    self._cfg = XmlConfigReader.Config(strConfigFilePath, strConfigSect)
    read_setting = self._cfg.getConfigValue

    user_name = read_setting("HARUserName")
    password = read_setting("HARPassword")
    entry_url = read_setting("EntryUrl")  # read but not used below
    gecko_path = read_setting("GeckoPath")
    # Built but never passed to the driver (the firefox_binary variant is
    # disabled in the original code).
    firefox_binary = FirefoxBinary(read_setting("FireFoxBinary"))

    browser = webdriver.Firefox(executable_path=gecko_path)
    self._driver = browser

    print(read_setting("EntryUrl"))
    browser.get(read_setting("StartingUrl"))  # load the web page

    # Fill in the login form and submit it with RETURN.
    email_box = browser.find_element_by_id("member_email")
    email_box.send_keys(user_name)
    password_box = browser.find_element_by_id("member_pass")
    password_box.send_keys(password)
    password_box.send_keys(Keys.RETURN)

    # Wait for the post-login page to offer the Matrix link.
    (matrix_link, failure_count) = self.find_wait_get_element(
        "link_text", "Enter Matrix MLS")

    login_window = browser.window_handles[0]  # captured but not used below
    enter_matrix_xpath = (
        "/html[@class='wf-effra-n4-active wf-effra-n7-active wf-effra-n3-active wf-effra-n5-active wf-effra-n9-active wf-active']"
        "/body/div[@class='content overlay']/div[@class='container']"
        "/div[@class='rightPane']/div[@class='box_simple gray agentbox newhar']"
        "/div[@class='box_content grid_view']/a[1]"
    )
    # NOTE(review): the trailing True appears to make the helper click the
    # element it finds - confirm against find_wait_get_element.
    (matrix_link, failure_count) = self.find_wait_get_element(
        "xpath", enter_matrix_xpath, True)

    # Give the new window time to open, then move over to it and drop the
    # login window.
    time.sleep(3)
    self.wait_for_new_window(browser)
    matrix_window = browser.window_handles[1]
    browser.close()
    browser.switch_to.window(matrix_window)
def scrapSoldProperties(datFrom, datTo, nJobId):
    """Scrape sold listings from Matrix for a sold-date range and store them.

    Logs in with the HAR credentials from the "AllPropScrapper"/"DEV"
    configuration, follows the entry link into Matrix (which opens a second
    window), loads the configured entry URL, restricts the status filter to
    "Sold" with the given date range and runs the search.  It then opens the
    first result and walks the detail pages through the "next" link.  Every
    parsed page is inserted into ``AllPropertyRecords`` in the Access
    database; rows whose insert fails are handed to ``appendToCSV``.  All
    parsed rows are finally passed to ``writeToCSV``.

    Args:
        datFrom: Start of the sold-date range; must provide ``strftime``.
        datTo: End of the sold-date range; must provide ``strftime``.
        nJobId: Job identifier written to ``FK_JobId`` and passed to
            ``appendToCSV`` for failed inserts.

    Returns:
        ``(nTotalCount, nRecCnt)``: the number of loop iterations completed
        and the record count shown on the result page.  ``(0, 0)`` is returned
        when the record count cannot be read or the "next" link is not found.

    The browser is always shut down before returning or propagating an error.
    """
    cfg = XmlConfigReader.Config("AllPropScrapper", "DEV")
    strUserName = cfg.getConfigValue("HARUserName")
    strPwd = cfg.getConfigValue("HARPassword")
    strEntryUrl = cfg.getConfigValue("EntryUrl")

    # Link on the HAR page that opens Matrix in a new window.
    xpEnterMatrix = (
        "/html[@class='wf-effra-n4-active wf-effra-n7-active wf-effra-n3-active wf-effra-n5-active wf-effra-n9-active wf-active']"
        "/body/div[@class='content overlay']/div[@class='container']"
        "/div[@class='rightPane']/div[@class='box_simple gray agentbox newhar']"
        "/div[@class='box_content grid_view']/a[1]"
    )

    # The Matrix pages are addressed with absolute XPaths that share long
    # prefixes; build them once instead of repeating every path in full.
    xpContainer = (
        "/html/body/form[@id='Form1']/div[@class='stickywrapper']"
        "/div[@class='tier3']/table/tbody/tr/td/div[@class='css_container']"
    )
    xpSearchContent = xpContainer + (
        "/div[@id='m_upSearch']/div[@id='m_pnlSearchTab']"
        "/div[@id='m_pnlSearch']/div[@class='css_content']"
    )
    xpStatusTable = xpSearchContent + (
        "/div[@id='m_sfcSearch']/div[@class='searchForm']"
        "/table/tbody/tr/td/table/tbody/tr[2]/td[1]"
        "/table/tbody/tr[2]/td/table/tbody/tr[4]/td[2]"
        "/table/tbody/tr/td[2]/table[@class='S_MultiStatus']/tbody"
    )
    # Status rows: 2 Active, 3 Option Pending, 4 Pending Continue to Show,
    # 5 Pending, 6 Sold, 7 Withdrawn, 8 Expired, 9 Terminated.
    xpStatusCheckbox = xpStatusTable + "/tr[{0}]/td[1]/div/input[@class='checkbox']"
    xpSoldDateRange = (
        xpStatusTable + "/tr[6]/td[2]/input[@id='FmFm1_Ctrl16_20916_Ctrl16_TB']"
    )
    xpResults = xpSearchContent + (
        "/div[@id='ctl12']/table[@class='buttonBar']/tbody/tr"
        "/td[@class='link important barleft'][2]"
        "/a[@id='m_ucSearchButtons_m_lbSearch']"
        "/span[@class='linkIcon icon_default']"
    )
    xpTotalRecCount = xpContainer + (
        "/div[@id='m_upSubHeader']/div[@id='m_pnlSubHeader']/div/table/tbody/tr"
        "/td[@class='css_innerLeft hideOnMap hideOnSearch hideNoResults']"
        "/span[@id='m_lblPagingSummary']/b[3]"
    )
    xpathFirstMLS = xpContainer + (
        "/div[3]/div[@id='m_upDisplay']/div[@id='m_pnlDisplayTab']"
        "/div[@id='m_divContent']/div[@id='m_pnlDisplay']"
        "/table[@class='displayGrid nonresponsive ajax_display d1m_show']"
        "/tbody/tr[@id='wrapperTable'][1]/td[@class='d1m5']/span[@class='d1m1']/a"
    )
    NextLinkId = 'm_DisplayCore_dpy3'

    executable_path = r'C:\Python35\selenium\webdriver\firefox\x86\geckodriver.exe'
    driver = webdriver.Firefox(executable_path=executable_path)
    try:
        print(strEntryUrl)
        driver.get(cfg.getConfigValue("StartingUrl"))  # load the web page

        # Log in with the HAR member credentials.
        driver.find_element_by_id("member_email").send_keys(strUserName)
        elemPwd = driver.find_element_by_id("member_pass")
        elemPwd.send_keys(strPwd)
        elemPwd.send_keys(Keys.RETURN)

        # Wait for the post-login page, then follow the link into Matrix.
        # NOTE(review): the trailing True appears to make the helper click the
        # element it finds - confirm against find_wait_get_element.
        find_wait_get_element(driver, "link_text", "Enter Matrix MLS")
        find_wait_get_element(driver, "xpath", xpEnterMatrix, True)

        # Matrix opens in a second window: drop the login window and go on there.
        wait_for_new_window(driver)
        window_after = driver.window_handles[1]
        driver.close()
        driver.switch_to.window(window_after)

        # Wait for the Matrix landing page before loading the search form.
        find_wait_get_element(driver, "partial_link_text", "New Listing (")
        driver.get(strEntryUrl)

        # Make sure the search form is loaded, then locate its controls.
        elemActive = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, xpStatusCheckbox.format(2))))
        elemOptionPending = driver.find_element_by_xpath(xpStatusCheckbox.format(3))
        elemPendConToShow = driver.find_element_by_xpath(xpStatusCheckbox.format(4))
        elemPending = driver.find_element_by_xpath(xpStatusCheckbox.format(5))
        elemSold = driver.find_element_by_xpath(xpStatusCheckbox.format(6))
        # Withdrawn / Expired / Terminated are located (so a changed page
        # layout still fails here) but their state is left untouched.
        for nStatusRow in (7, 8, 9):
            driver.find_element_by_xpath(xpStatusCheckbox.format(nStatusRow))
        elemResults = driver.find_element_by_xpath(xpResults)

        # Untick Active and the pending variants; make sure Sold is ticked.
        for elemStatus in (elemActive, elemOptionPending, elemPendConToShow,
                           elemPending):
            if elemStatus.is_selected():
                elemStatus.click()
        if not elemSold.is_selected():
            elemSold.click()

        # Set the sold-date range and run the search.
        strDateRange = (datFrom.strftime("%m/%d/%Y") + "-" +
                        datTo.strftime("%m/%d/%Y"))
        elemSoldDateRange = driver.find_element_by_xpath(xpSoldDateRange)
        elemSoldDateRange.clear()
        elemSoldDateRange.send_keys(strDateRange)
        elemResults.click()

        # Read the total number of records from the paging summary.
        (elemRecCnt, nFailureCnt) = find_wait_get_element(driver, "xpath",
                                                          xpTotalRecCount)
        try:
            nRecCnt = int(elemRecCnt.text)
        except (AttributeError, TypeError, ValueError):
            # TODO: some work to do when the number of records returned >5000
            # Without a usable count the paging loop cannot run; the original
            # code fell through to a NameError on nRecCnt at this point.
            print('exception!')
            return (0, 0)

        # Open the first listing: wait until the link is there, then click it.
        find_wait_get_element(driver, "xpath", xpathFirstMLS)
        find_wait_get_element(driver, "xpath", xpathFirstMLS, True)

        lstScrapResults = []
        nTotalCount = 0
        db = DBMSAccess.MSAccess(r"c:/temp/RealAnalysis.accdb")

        # Now iterate through the detail pages.
        # NOTE(review): "next" is requested both at the top and at the bottom
        # of every iteration, exactly as in the original statement order -
        # verify that no detail pages are skipped.
        while nTotalCount < nRecCnt - 1:
            print("Rec {0} of {1}".format(nTotalCount + 1, nRecCnt))
            (elemNextLnk, nFailureCnt) = find_wait_get_element(
                driver, "id", NextLinkId, True)
            if elemNextLnk is None:
                return (0, 0)

            pageSource = driver.page_source

            # The coordinates live in the "View Map" popup window.
            mainWindow = driver.window_handles[0]
            (elemViewMap, temp) = find_wait_get_element(
                driver, "xpath", '//*[@title="View Map"]', True)
            if elemViewMap is not None:
                wait_for_new_window(driver)
                mapWindow = driver.window_handles[1]
                driver.switch_to.window(mapWindow)
                # The hidden field's value is "$"-separated; items 1 and 2 are
                # used as latitude and longitude.
                (elemLatLon, temp) = find_wait_get_element(
                    driver, "id", "m_ucStreetViewService_m_hfParams")
                tagText = str(elemLatLon.get_attribute("value"))
                (lat, lon) = tagText.split("$")[1:3]
                # Close the map popup and switch back to the original window.
                driver.close()
                driver.switch_to.window(mainWindow)
            else:
                (lat, lon) = (None, None)

            dictPageResult = PropScrap.parseDetails(pageSource)
            if dictPageResult is not None:
                dictPageResult["Latitude"] = lat
                dictPageResult["Longitude"] = lon
                nMLSNum = dictPageResult['MLSNum']
                if nMLSNum is not None:
                    lstScrapResults.append(dictPageResult)
                    if db.InsertDictionary("AllPropertyRecords",
                                           dictPageResult) == 0:
                        # if insertion fails:
                        print("insertion failed. record: {0}".format(
                            str(dictPageResult)))
                        appendToCSV(nJobId, nMLSNum, str(dictPageResult))
                    else:
                        db.UpdateTable(
                            "AllPropertyRecords",
                            ["LastUpdate", "FK_JobId"],
                            [datetime.datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S"), nJobId],
                            ["MLSNum"], [int(nMLSNum)], False)
                    db.Committ()

            (elemNextLnk, nFailureCnt) = find_wait_get_element(
                driver, "id", NextLinkId, True)
            nTotalCount += 1

        writeToCSV(lstScrapResults)
        return (nTotalCount, nRecCnt)
    finally:
        # Always release the browser, including on the early (0, 0) returns
        # and on errors, which previously left the Firefox process running.
        driver.quit()
db.Committ() #if bNeedToRefreshNext or nFailureCnt>0: # elemNextLnk = find_wait_get_element(driver, "id", NextLinkId) #elemNextLnk.click() (elemNextLnk, nFailureCnt) = find_wait_get_element(driver, "id", NextLinkId, True) nTotalCount += 1 writeToCSV(lstScrapResults) driver.quit() return (nTotalCount, nRecCnt) if __name__ == "__main__": cfg = XmlConfigReader.Config("AllPropScrapper", "DEV") host = cfg.getConfigValue(r'MySQL/host') port = int(cfg.getConfigValue(r"MySQL/port")) user = cfg.getConfigValue(r"MySQL/user") passwd = cfg.getConfigValue(r"MySQL/password") db = cfg.getConfigValue(r"MySQL/DB") db = DBAccess('mysql', host=host, port=port, db_name=db, user_id=user, pwd=passwd) #db = DBMSAccess.MSAccess(r"c:/temp/RealAnalysis.accdb") sql = "SELECT JobId, DateFrom, DateTo FROM JobLog WHERE Status is null" db._cursor.execute(sql)
def scrapAllProperties(db, datFrom, datTo, strPropType, strPropStat, nJobId):
    """Scrape Matrix listings for a date range and transfer them to ``db``.

    Logs in with the HAR credentials from the "AllPropScrapper"/"DEV"
    configuration, follows the entry link into Matrix (which opens a second
    window), fills the classic search form through
    ``queryAllPropClassicPage`` and opens the first result.  It then walks the
    detail pages through the "next" link.  Every parsed page is handed to
    ``db.TransferDictionary("Matrix_AllPropRecords", ...)``; rows whose
    transfer does not return 1 are handed to ``appendToCSV``.

    Args:
        db: Object providing ``TransferDictionary(table_name, row_dict)``.
        datFrom: Start of the date range; must provide ``strftime``.
        datTo: End of the date range; must provide ``strftime``.
        strPropType: Currently unused; the form always selects options 1, 2,
            3 and 5 of the property-type list.
        strPropStat: Currently unused; the form always ticks the first five
            status boxes.
        nJobId: Job identifier passed to ``appendToCSV`` for failed transfers.

    Returns:
        ``(nTotalCount, nRecCnt)``: the number of records handed to the
        database layer and the record count shown on the result page.
        ``(0, 0)`` is returned when the record count cannot be read or the
        "next" link is not found.

    The browser is always shut down before returning or propagating an error.
    """
    cfg = XmlConfigReader.Config("AllPropScrapper", "DEV")
    strUserName = cfg.getConfigValue("HARUserName")
    strPwd = cfg.getConfigValue("HARPassword")
    strEntryUrl = cfg.getConfigValue("EntryUrl")

    # Link on the HAR page that opens Matrix in a new window.
    xpEnterMatrix = (
        "/html[@class='wf-effra-n4-active wf-effra-n7-active wf-effra-n3-active wf-effra-n5-active wf-effra-n9-active wf-active']"
        "/body/div[@class='content overlay']/div[@class='container']"
        "/div[@class='rightPane']/div[@class='box_simple gray agentbox newhar']"
        "/div[@class='box_content grid_view']/a[1]"
    )

    # The Matrix pages are addressed with absolute XPaths that share long
    # prefixes; build them once instead of repeating every path in full.
    xpContainer = (
        "/html/body/form[@id='Form1']/div[@class='stickywrapper']"
        "/div[@class='tier3']/table/tbody/tr/td/div[@class='css_container']"
    )
    xpSearchForm = xpContainer + (
        "/div[@id='m_upSearch']/div[@id='m_pnlSearchTab']"
        "/div[@id='m_pnlSearch']/div[@class='css_content']"
        "/div[@id='m_sfcSearch']/div[@class='searchForm']"
        "/table/tbody/tr/td/table/tbody/tr[2]/td[1]"
    )
    xpStatusTable = xpSearchForm + (
        "/table/tbody/tr[2]/td/table/tbody/tr[4]/td[2]"
        "/table/tbody/tr/td[2]/table[@class='S_MultiStatus']/tbody"
    )
    # Status rows: 2 Active, 3 Option Pending, 4 Pending Continue to Show,
    # 5 Pending, 6 Sold.
    xpStatusCheckbox = xpStatusTable + "/tr[{0}]/td[1]/div/input[@class='checkbox']"
    xpInputActive = (
        xpStatusTable + "/tr[2]/td[2]/input[@id='FmFm1_Ctrl16_20915_Ctrl16_TB']"
    )
    xpInputSold = (
        xpStatusTable + "/tr[6]/td[2]/input[@id='FmFm1_Ctrl16_20916_Ctrl16_TB']"
    )
    xpPropTypeOption = xpSearchForm + (
        "/table/tbody/tr[5]/td/table/tbody/tr[2]/td[2]"
        "/table/tbody/tr[1]/td[2]/select[@id='Fm1_Ctrl129_LB']/option[{0}]"
    )
    xpRecordCount = xpContainer + (
        "/div[@id='m_upSubHeader']/div[@id='m_pnlSubHeader']/div/table/tbody/tr"
        "/td[@class='css_innerLeft hideOnMap hideOnSearch hideNoResults']"
        "/span[@id='m_lblPagingSummary']/b[3]"
    )
    xpathFirstMLS = xpContainer + (
        "/div[3]/div[@id='m_upDisplay']/div[@id='m_pnlDisplayTab']"
        "/div[@id='m_divContent']/div[@id='m_pnlDisplay']"
        "/table[@class='displayGrid nonresponsive ajax_display d1m_show']"
        "/tbody/tr[@id='wrapperTable'][1]/td[@class='d1m5']/span[@class='d1m1']/a"
    )
    NextLinkId = 'm_DisplayCore_dpy3'

    driver = webdriver.Chrome()
    try:
        print(strEntryUrl)
        driver.get(cfg.getConfigValue("StartingUrl"))  # load the web page

        # Log in with the HAR member credentials.
        driver.find_element_by_id("member_email").send_keys(strUserName)
        elemPwd = driver.find_element_by_id("member_pass")
        elemPwd.send_keys(strPwd)
        elemPwd.send_keys(Keys.RETURN)

        # Wait for the post-login page, then follow the link into Matrix.
        # NOTE(review): the trailing True appears to make the helper click the
        # element it finds - confirm against find_wait_get_element.
        find_wait_get_element(driver, "link_text", "Enter Matrix MLS")
        find_wait_get_element(driver, "xpath", xpEnterMatrix, True)

        # Matrix opens in a second window: drop the login window and go on there.
        time.sleep(3)
        wait_for_new_window(driver)
        window_after = driver.window_handles[1]
        driver.close()
        driver.switch_to.window(window_after)

        # Describe the search form: status boxes, date ranges, property types.
        strDtRange = (datFrom.strftime("%m/%d/%Y") + '-' +
                      datTo.strftime("%m/%d/%Y"))
        lstFormInputs = [(xpStatusCheckbox.format(nStatusRow), 'CheckBox', True)
                         for nStatusRow in (2, 3, 4, 5, 6)]
        lstFormInputs.append((xpInputActive, 'TextBox', strDtRange))
        # The sold-date entry must address the Sold text box; the original
        # passed the Sold *checkbox* path here and never used xpInputSold.
        lstFormInputs.append((xpInputSold, 'TextBox', ''))
        lstFormInputs.extend((xpPropTypeOption.format(nOption), 'ListItem', True)
                             for nOption in (1, 2, 3, 5))
        queryAllPropClassicPage(driver, lstFormInputs)

        # Now the result page loads: read the total number of records.
        elemTotRecCnt = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, xpRecordCount)))
        try:
            nRecCnt = int(elemTotRecCnt.text)
        except (AttributeError, TypeError, ValueError):
            # TODO: some work to do when the number of records returned >5000
            # Without a usable count the paging loop cannot run; the original
            # code fell through to a NameError on nRecCnt at this point.
            print('exception!')
            return (0, 0)

        # Open the first listing in the result grid.
        (elemFirstMLS, nFailureCnt) = find_wait_get_element(driver, "xpath",
                                                            xpathFirstMLS)
        elemFirstMLS.click()

        nTotalCount = 0
        # NOTE(review): the counter only advances for pages that yield an
        # MLSNum; other pages are skipped until the "next" link disappears.
        while nTotalCount < nRecCnt:
            print("Rec {0} of {1}".format(nTotalCount + 1, nRecCnt))
            (elemNextLnk, nFailureCnt) = find_wait_get_element(
                driver, "id", NextLinkId, True)
            if elemNextLnk is None:
                return (0, 0)

            pageSource = driver.page_source

            # The coordinates live in the "View Map" popup window.
            mainWindow = driver.window_handles[0]
            (elemViewMap, temp) = find_wait_get_element(
                driver, "xpath", '//*[@title="View Map"]', True)
            if elemViewMap is not None:
                wait_for_new_window(driver)
                mapWindow = driver.window_handles[1]
                driver.switch_to.window(mapWindow)
                # The hidden field's value is "$"-separated; items 1 and 2 are
                # used as latitude and longitude.
                (elemLatLon, temp) = find_wait_get_element(
                    driver, "id", "m_ucStreetViewService_m_hfParams")
                tagText = str(elemLatLon.get_attribute("value"))
                (lat, lon) = tagText.split("$")[1:3]
                # Close the map popup and switch back to the original window.
                driver.close()
                driver.switch_to.window(mainWindow)
            else:
                (lat, lon) = (None, None)

            dictPageResult = PropScrap.parseDetails(pageSource)
            if dictPageResult is not None:
                dictPageResult["Latitude"] = lat
                dictPageResult["Longitude"] = lon
                nMLSNum = dictPageResult['MLSNum']
                if nMLSNum is not None:
                    if db.TransferDictionary("Matrix_AllPropRecords",
                                             dictPageResult) != 1:
                        # if insertion fails:
                        print("insertion failed. record: {0}".format(
                            str(dictPageResult)))
                        appendToCSV(nJobId, nMLSNum, str(dictPageResult))
                    # Counted whether or not the transfer succeeded.
                    nTotalCount += 1

        # Same result shape as the (0, 0) failure path above.
        return (nTotalCount, nRecCnt)
    finally:
        # Always release the browser; the original never shut it down.
        driver.quit()
for col in header: print(col) try: #print(row[col]) newRow.append(row[col]) except: print('key {0} not found in result'.format(col)) newRow.append(None) aryValues.append(newRow) with open(r"c:\temp\output.csv", 'w') as resultFile: wr = csv.writer(resultFile, dialect='excel') wr.writerows(aryValues) print('done') if __name__ == "__main__": cfg = XmlConfigReader.Config("NewListingScrapper","DEV") strUserName = cfg.getConfigValue("UserName") strPwd = cfg.getConfigValue("Password") #strUrl = str(cfg.getConfigValue("EntryUrl")) executable_path = r'C:\Python35\selenium\webdriver\firefox\x86\geckodriver.exe' binary = FirefoxBinary('C:/Program Files (x86)/Mozilla Firefox/firefox.exe') driver = webdriver.Firefox(executable_path=executable_path) #driver = webdriver.Firefox(firefox_binary=binary) print(cfg.getConfigValue("EntryUrl")) driver.get(cfg.getConfigValue("EntryUrl")) # load the web page #look for user name log in: elemUsr = driver.find_element_by_id("member_email") elemUsr.send_keys(strUserName)
def readAllPropScrapperConfigSections():
    """Read the page-section definitions of the "AllPropScrapper"/"DEV" config.

    For every configured section two values are read:
    ``PageSections/Section[@name='<name>']/SectionString`` and
    ``.../ColumnDictionary``.  The latter is stripped and evaluated with
    ``ast.literal_eval``.

    Returns:
        A tuple ``(lstSectKeys, lstSectDict, dictSectionLookup)``:
        the section strings in configuration order, the evaluated column
        dictionaries in the same order, and a mapping from each section
        string to its column dictionary.
    """
    cfg = XmlConfigReader.Config("AllPropScrapper", "DEV")

    # Sections are processed in this order.  "Rooms" is deliberately absent
    # (it is disabled in the original code), and "LeaseAdditinal" is spelled
    # exactly as the configuration lookup expects it.
    section_names = (
        "General",
        "ListingOffice",
        "School",
        "Description",
        "Additional",
        "LeaseAdditinal",
        "Financial",
        "Pending",
        "Sold",
        "LeasedInformation",
    )

    lstSectKeys = []
    lstSectDict = []
    dictSectionLookup = {}

    for section_name in section_names:
        section_path = "PageSections/Section[@name='{0}']".format(section_name)
        section_string = cfg.getConfigValue(section_path + "/SectionString")
        raw_columns = cfg.getConfigValue(section_path + "/ColumnDictionary")
        column_map = ast.literal_eval(raw_columns.strip())

        lstSectKeys.append(section_string)
        lstSectDict.append(column_map)
        dictSectionLookup[section_string] = column_map

    return (lstSectKeys, lstSectDict, dictSectionLookup)
import geocoder import pymysql import traceback import datetime import XmlConfigReader from censusgeocode import CensusGeocode cfg = XmlConfigReader.Config("AddrGeocoder", 'DEV') '''geocode with bing, returns a tuple with the format: lat, lon, strFullAddr, neighborhood, geocoderName, quality, accuracy, ''' def replaceNone(str): if str is None: return '' else: return str def GeoCode(GeoCoder, strAddr): strBingMapKey = cfg.getConfigValue(r"Geocoder/BingKey") #strBingMapKey = 'AjlU0VglpeaGSVjfdrvFNEEZKSRWLtUYbDGGBbkVq1SsFK6Vz724WpqxqRi2m8SJ' try: if GeoCoder == 'google': g = geocoder.google(strAddr) return (g.lat, g.lng, g.address, GeoCoder, g.neighborhood, g.quality, g.accuracy, None) elif GeoCoder == 'bing': g = geocoder.bing(strAddr, key=strBingMapKey)