def parse_details(htmlstring, driver1, table_name="maricopa"):
    """Scrape product details from the page currently loaded in ``driver1``
    and insert one row into the database.

    Fields scraped: brand, title, three category ratings (planet / people /
    animals, in that fixed DOM order), overall rating and description.

    Args:
        htmlstring: unused here; kept for signature compatibility with callers.
        driver1: Selenium WebDriver with the detail page already loaded.
        table_name: destination table.  NOTE(review): the original body read an
            undefined ``table_name``; the resulting NameError was swallowed by
            a bare ``except`` so no row was ever written.  The default
            "maricopa" matches the sibling crawlers in this file — confirm the
            intended table with the caller.
    """
    rows = []
    try:
        brand_name = driver1.find_element_by_xpath(
            "//h1[contains(@class, 'sc-csuQGl') and contains(@class, 'icyJZl')]"
        ).text
        title = driver1.find_element_by_xpath(
            "//h4[contains(@class, 'sc-gipzik')]").text
        # The three rating spans share one styled-component class; their order
        # on the page determines which rating is which.
        xpath_infos = driver1.find_elements_by_xpath(
            "//span[contains(@class, 'StyledText-sc-1sadyjn-0') and contains(@class, 'bVvIwM')]"
        )
        planet = xpath_infos[0].text
        people = xpath_infos[1].text
        animals = xpath_infos[2].text
        description = driver1.find_element_by_xpath(
            "//div[contains(@class, 'sc-hzDkRC') and contains(@class, 'giRCDN')]"
        ).text
        overall_rating = driver1.find_element_by_xpath(
            "//h2[contains(@class, 'sc-bRBYWo')]").text
        overall_rating = overall_rating.replace("Overall rating: ", "")
        print("Brand-------------> : ", brand_name)
        print("Title-------------> : ", title)
        print("Planet------------> : ", planet)
        print("People------------> : ", people)
        print("Animals-----------> : ", animals)
        print("Overall Rating----> : ", overall_rating)
        print("Description-------> : ", description)
        # Deterministic row id: md5 over the identifying fields (description is
        # deliberately excluded, matching the original behaviour).
        string_identify = (brand_name + title + planet + people + animals
                           + overall_rating)
        identifier = hashlib.md5(string_identify.encode('utf8')).hexdigest()
        create_time = str(datetime.datetime.now())
        update_time = ""
        rows.append((brand_name, title, planet, people, animals,
                     overall_rating, description, identifier, create_time,
                     update_time))
        InsertDB().insert_document(rows, table_name)
    except Exception as exc:
        # Best-effort scrape: a missing element skips this listing instead of
        # aborting the crawl — but unlike the old bare except, log the cause.
        print("Continue", exc)
def main(htmlstring, driver):
    """Crawl two pages of Zillow search results (ZIP 85233) and, for each
    "for sale by owner" listing, open the detail page in ``driver``, scrape
    owner phone numbers from the rendered page source and insert one DB row
    per phone number found.

    Args:
        htmlstring: unused; kept for signature compatibility with callers.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'referer': 'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Page 1 uses an empty "pagination" object; later pages set currentPage.
    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285233%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.87922703222657%2C%22east%22%3A-111.75151096777344%2C%22south%22%3A33.286508479539734%2C%22north%22%3A33.40610372761989%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94827%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"
    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A{}%7D%2C%22usersSearchTerm%22%3A%2285233%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.87922703222657%2C%22east%22%3A-111.75151096777344%2C%22south%22%3A33.286508479539734%2C%22north%22%3A33.40610372761989%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94827%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"
    counts = 1
    for page in range(1, 3):
        url = first_url if page == 1 else default_url.format(page)
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i, listing in enumerate(properties_infos):
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1
            # Address parts are optional in the API payload; default to "".
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")
            # format() tolerates non-string parts (e.g. a numeric zipcode),
            # where the original "+" concatenation raised an uncaught TypeError.
            property_address = "{}, {}, {} {}".format(street_add, city, state, zipcode)
            if "by owner" not in status_text:
                continue
            print("--------------------------------------------------> : ", i + 1)
            print("Property Address--------------------> : ", property_address)
            print("Property Url------------------------> : ", property_url)
            print("Property Status---------------------> : ", status_text)
            driver.get(property_url)
            time.sleep(10)  # give the detail page time to render
            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            for phone_number in phones:
                print("Owner Phone Number------------------> : ", phone_number)
                string_id = property_address + status_text + phone_number
                identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""
                # BUG FIX: insert exactly one row per phone number.  The
                # original appended to a list that survived the phone loop and
                # re-inserted the whole list each iteration, duplicating rows.
                InsertDB().insert_document(
                    [(property_address, street_add, city, state, zipcode,
                      status_text, phone_number, identifier, create_time,
                      update_time)], table_name)
def main(htmlstring, driver):
    """Crawl four pages of Zillow search results (ZIP 85298) and, for each
    "for sale by owner" listing, render the detail page in ``driver``, scrape
    the owner phone number(s) and the home-fact list (parking, year built,
    HOA, heating, lot, cooling, price/sqft), and insert one DB row per phone.

    Args:
        htmlstring: unused; kept for signature compatibility with callers.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa_30_08_2020"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        # NOTE(review): session cookie captured from a browser on 2020-09 —
        # almost certainly expired; refresh before relying on this crawler.
        'cookie': 'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Page 1 uses an empty "pagination" object; later pages set currentPage.
    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285298%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.79299953222655%2C%22east%22%3A-111.66528346777342%2C%22south%22%3A33.17769548749086%2C%22north%22%3A33.29743987631964%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A399659%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"
    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A{}%7D%2C%22usersSearchTerm%22%3A%2285298%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.79299953222655%2C%22east%22%3A-111.66528346777342%2C%22south%22%3A33.17769548749086%2C%22north%22%3A33.29743987631964%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A399659%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"
    counts = 1
    for page in range(1, 5):
        url = first_url if page == 1 else default_url.format(page)
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i, listing in enumerate(properties_infos):
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1
            # All of these fields are optional in the API payload; default "".
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")
            # format() tolerates non-string parts (e.g. a numeric zipcode),
            # where the original "+" concatenation raised an uncaught TypeError.
            property_address = "{}, {}, {} {}".format(street_add, city, state, zipcode)
            bathrooms = home_info.get("bathrooms", "")
            bedrooms = home_info.get("bedrooms", "")
            tax_assessed_value = home_info.get("taxAssessedValue", "")
            zestimate = home_info.get("zestimate", "")
            rent_zestimate = home_info.get("rentZestimate", "")
            home_type = home_info.get("homeType", "")
            if "by owner" not in status_text:
                continue
            print("--------------------------------------------------> : ", i + 1)
            driver.get(property_url)
            time.sleep(10)  # give the detail page time to render
            try:
                wait(driver, "//ul[@class='ds-home-fact-list']")
            except Exception:
                print("There is no xpath")
            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            for phone_number in phones:
                # The facts list pairs a label span with a value span; zip them
                # and map known labels onto our columns.
                features_labels = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
                )
                features_infos = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
                )
                parking = ""
                year_built = ""
                hoa = ""
                heating = ""
                lot = ""
                cooling = ""
                price_sqft = ""
                for feature_label, feature_info in zip(features_labels,
                                                       features_infos):
                    feature_label_txt = feature_label.text
                    if 'Parking' in feature_label_txt:
                        parking = feature_info.text
                    elif 'Year built' in feature_label_txt:
                        year_built = feature_info.text
                    elif 'HOA' in feature_label_txt:
                        hoa = feature_info.text
                    elif 'Heating' in feature_label_txt:
                        heating = feature_info.text
                    elif 'Lot' in feature_label_txt:
                        lot = feature_info.text
                    elif 'Cooling' in feature_label_txt:
                        cooling = feature_info.text
                    elif 'Price/' in feature_label_txt:
                        price_sqft = feature_info.text
                print("Property Address--------------------> : ", property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)
                print("BathRooms---------------------------> : ", bathrooms)
                print("BedRooms----------------------------> : ", bedrooms)
                print("Tax Assessed Value------------------> : ", tax_assessed_value)
                print("Zestimate---------------------------> : ", zestimate)
                print("Rent Zestimate----------------------> : ", rent_zestimate)
                print("Home Type---------------------------> : ", home_type)
                print("Parking-----------------------------> : ", parking)
                print("Year Built--------------------------> : ", year_built)
                print("HOA---------------------------------> : ", hoa)
                print("Heating-----------------------------> : ", heating)
                print("Lot---------------------------------> : ", lot)
                print("Cooling-----------------------------> : ", cooling)
                print("Price Sqft--------------------------> : ", price_sqft)
                string_id = property_address + status_text + phone_number
                identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""
                # BUG FIX: insert exactly one row per phone number.  The
                # original appended to a list that survived the phone loop and
                # re-inserted the whole list each iteration, duplicating rows.
                InsertDB().insert_document(
                    [(property_address, street_add, city, state, zipcode,
                      status_text, phone_number, bathrooms, bedrooms,
                      tax_assessed_value, zestimate, rent_zestimate, home_type,
                      parking, year_built, hoa, heating, lot, cooling,
                      price_sqft, identifier, create_time, update_time)],
                    table_name)
def main(htmlstring, driver):
    """Crawl six pages of Zillow search results (ZIP 85018) built from a
    hand-assembled searchQueryState URL, and for each "for sale by owner"
    listing render the detail page in ``driver``, scrape owner phone numbers
    and insert one DB row per phone number.

    Args:
        htmlstring: unused; kept for signature compatibility with callers.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'referer': 'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # searchQueryState building blocks, kept as strings so they drop straight
    # into the hand-built (unencoded) JSON query below.
    pagination = ""
    usersSearchTerm = "85018"
    west = "-112.03634614855957"
    east = "-111.92699785144043"
    south = "33.45540259905828"
    north = "33.574765505875604"
    regionId = "94734"
    regionType = "7"
    mapZoom = "13"
    includeList = "true"
    # Page 1 uses an empty "pagination" object; later pages set currentPage.
    # (Removed: an unused format-template URL and a dead pre-loop
    # default_page_url that was overwritten before every use.)
    first_case_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{' + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList=' + includeList
    print(first_case_url)
    counts = 1
    for page in range(1, 7):
        default_page_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":' + str(page) + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList=' + includeList
        url = first_case_url if page == 1 else default_page_url
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i, listing in enumerate(properties_infos):
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1
            # Address parts are optional in the API payload; default to "".
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")
            # format() tolerates non-string parts (e.g. a numeric zipcode),
            # where the original "+" concatenation raised an uncaught TypeError.
            property_address = "{}, {}, {} {}".format(street_add, city, state, zipcode)
            if "by owner" not in status_text:
                continue
            print("--------------------------------------------------> : ", i + 1)
            driver.get(property_url)
            time.sleep(10)  # give the detail page time to render
            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            for phone_number in phones:
                print("Property Address--------------------> : ", property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)
                string_id = property_address + status_text + phone_number
                identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""
                # BUG FIX: insert exactly one row per phone number.  The
                # original appended to a list that survived the phone loop and
                # re-inserted the whole list each iteration, duplicating rows.
                InsertDB().insert_document(
                    [(property_address, street_add, city, state, zipcode,
                      status_text, phone_number, identifier, create_time,
                      update_time)], table_name)
def main(htmlstring, driver):
    """Crawl fourteen pages of Zillow search results for Maricopa County, AZ
    (filtered to max price 1000 / max monthly payment 3) and, for each
    "for sale by owner" listing, render the detail page in ``driver``, scrape
    the owner phone number(s) and the home-fact list (parking, year built,
    HOA, heating, lot, cooling, price/sqft), and insert one DB row per phone.

    Args:
        htmlstring: unused; kept for signature compatibility with callers.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa_30_08_2020"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # searchQueryState building blocks, kept as strings so they drop straight
    # into the hand-built (unencoded) JSON query below.
    pagination = ""
    usersSearchTerm = "Maricopa County, AZ"
    west = "-112.42886871875002"
    east = "-110.68204254687502"
    south = "33.142087182945815"
    north = "35.173746183166216"
    regionId = "2402"
    regionType = "4"
    mapZoom = "9"
    includeList = "true"
    priceMax = "1000"
    monthlyPay = "3"
    # Page 1 uses an empty "pagination" object; later pages set currentPage.
    # (Removed: an unused format-template URL and a dead pre-loop
    # default_page_url that was overwritten before every use.)
    first_case_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{' + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + '},"monthlyPayment":{"max":' + monthlyPay + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList
    print(first_case_url)
    counts = 1
    for page in range(1, 15):
        default_page_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":' + str(page) + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + '},"monthlyPayment":{"max":' + monthlyPay + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList
        url = first_case_url if page == 1 else default_page_url
        print(url)
        response = requests.get(url, headers=header)
        result = response.json()
        print(result)
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i, listing in enumerate(properties_infos):
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1
            # All of these fields are optional in the API payload; default "".
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")
            # format() tolerates non-string parts (e.g. a numeric zipcode),
            # where the original "+" concatenation raised an uncaught TypeError.
            property_address = "{}, {}, {} {}".format(street_add, city, state, zipcode)
            bathrooms = home_info.get("bathrooms", "")
            bedrooms = home_info.get("bedrooms", "")
            tax_assessed_value = home_info.get("taxAssessedValue", "")
            zestimate = home_info.get("zestimate", "")
            rent_zestimate = home_info.get("rentZestimate", "")
            home_type = home_info.get("homeType", "")
            if "by owner" not in status_text:
                continue
            print("--------------------------------------------------> : ", i + 1)
            driver.get(property_url)
            time.sleep(10)  # give the detail page time to render
            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            for phone_number in phones:
                # The facts list pairs a label span with a value span; zip them
                # and map known labels onto our columns.
                features_labels = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
                )
                features_infos = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
                )
                # BUG FIX: initialize every fact column.  The original never
                # did, so a listing missing a label (or an empty fact list)
                # raised NameError at insert time.
                parking = ""
                year_built = ""
                hoa = ""
                heating = ""
                lot = ""
                cooling = ""
                price_sqft = ""
                for feature_label, feature_info in zip(features_labels,
                                                       features_infos):
                    feature_label_txt = feature_label.text
                    if 'Parking' in feature_label_txt:
                        parking = feature_info.text
                    elif 'Year built' in feature_label_txt:
                        year_built = feature_info.text
                    elif 'HOA' in feature_label_txt:
                        hoa = feature_info.text
                    elif 'Heating' in feature_label_txt:
                        heating = feature_info.text
                    elif 'Lot' in feature_label_txt:
                        lot = feature_info.text
                    elif 'Cooling' in feature_label_txt:
                        cooling = feature_info.text
                    elif 'Price/' in feature_label_txt:
                        price_sqft = feature_info.text
                print("Property Address--------------------> : ", property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)
                print("BathRooms---------------------------> : ", bathrooms)
                print("BedRooms----------------------------> : ", bedrooms)
                print("Tax Assessed Value------------------> : ", tax_assessed_value)
                print("Zestimate---------------------------> : ", zestimate)
                print("Rent Zestimate----------------------> : ", rent_zestimate)
                print("Home Type---------------------------> : ", home_type)
                print("Parking-----------------------------> : ", parking)
                print("Year Built--------------------------> : ", year_built)
                print("HOA---------------------------------> : ", hoa)
                print("Heating-----------------------------> : ", heating)
                print("Lot---------------------------------> : ", lot)
                print("Cooling-----------------------------> : ", cooling)
                print("Price Sqft--------------------------> : ", price_sqft)
                string_id = property_address + status_text + phone_number
                identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""
                # BUG FIX: insert exactly one row per phone number.  The
                # original appended to a list that survived the phone loop and
                # re-inserted the whole list each iteration, duplicating rows.
                InsertDB().insert_document(
                    [(property_address, street_add, city, state, zipcode,
                      status_text, phone_number, bathrooms, bedrooms,
                      tax_assessed_value, zestimate, rent_zestimate, home_type,
                      parking, year_built, hoa, heating, lot, cooling,
                      price_sqft, identifier, create_time, update_time)],
                    table_name)
def monitor_jobs_crawler(d, d1):
    """Poll the Upwork job feed, scrape per-job client statistics with a second
    driver, filter jobs by rate/level/hours/duration rules, and store matches.

    d  -- driver/session consumed by the module-level query_jobs()/query_jobs_second()
    d1 -- selenium-wire driver used to load each job's detail page (has `.requests`)

    Reads/writes module globals: title_val (newest title seen by the previous
    run, used as the stop marker), all_loops, refresh_status.  Also relies on
    module-level level_array, weekly_hours_array, period_array, average_rate,
    fixed_amount and InsertDB.

    NOTE(review): this function calls datetime.now()/datetime.utcnow(), i.e. it
    assumes `from datetime import datetime`, while other functions in this file
    use `datetime.datetime.now()` — confirm which import this script actually has.
    """
    global title_val, all_loops, refresh_status

    nowtime = datetime.now()
    nowtimeUTC = datetime.utcnow()
    nowtimestr = nowtime.strftime("%Y-%m-%d %H:%M:%S")
    nowtimeUTCstr = nowtimeUTC.strftime("%Y-%m-%d %H:%M:%S")
    print("NowTime--------------- -------> : ", nowtimestr)
    print("NowUTCTime-------------------> : ", nowtimeUTCstr)

    counts = 1
    moreBtn_click_accounts = 1

    result = query_jobs(d)
    if result is None:
        print('Error. Result is None')
        return
    resultSetTs = result["paging"]["resultSetTs"]
    print("Next Paging ID--------------------->", resultSetTs)

    while True:
        # After the first pass, fetch the next result page every 10 jobs.
        if refresh_status != 0 and counts % 10 == 1:
            result = query_jobs_second(d, resultSetTs, moreBtn_click_accounts)
            if result is None:
                print("Error, Result is None")
                return
            moreBtn_click_accounts = moreBtn_click_accounts + 1

        for job in result["results"]:
            print(
                "------------------------------------------------------------->",
                counts)
            if counts >= 50:  # hard cap per crawl run
                return
            nowtime = datetime.now()
            title = job["title"]

            if counts == 1:
                # Stop immediately if the feed's newest job is the one the
                # previous run already recorded.
                if title_val == title:
                    return
                first_title = title
                if all_loops == 1:
                    title_val = first_title
                    print("Store the first project Title: ", title_val)
            if counts != 1:
                # Reached the previously-seen job mid-page: remember the new
                # newest title and stop.
                if title_val == title:
                    print(title_val, title,
                          "---------------------> title is the same")
                    title_val = first_title
                    return
            print("First Job title------------------> ", first_title)
            print("Val Job title--------------------> ", title_val)

            # --- raw fields from the feed entry -------------------------------
            createdOn = job["createdOn"]
            job_type = job["type"]  # renamed from `type` (shadowed the builtin)
            ciphertext = job["ciphertext"]
            duration = job["duration"]
            shortDuration = job["shortDuration"]
            engagement = job["engagement"]
            shortEngagement = job["shortEngagement"]
            amount = int(job["amount"]["amount"])
            recno = job["recno"]
            uid = job["uid"]
            client_paymentverification = job["client"]["paymentVerificationStatus"]
            cient_country = job["client"]["location"]["country"]
            totalSpent = job["client"]["totalSpent"]
            totalReviews = job["client"]["totalReviews"]
            totalFeedback = job["client"]["totalFeedback"]
            lastContractPlatform = job["client"]["lastContractPlatform"]
            lastContractRid = job["client"]["lastContractRid"]
            lastContractTitle = job["client"]["lastContractTitle"]
            feedbackText = job["client"]["feedbackText"]
            try:
                feedbackText = (feedbackText.split(","))[0]
            except Exception:
                feedbackText = ""
            companyOrgUid = job["client"]["companyOrgUid"]
            freelancersToHire = job["freelancersToHire"]
            enterpriseJob = job["enterpriseJob"]
            tierText = job["tierText"]
            tier = job["tier"]
            tierLabel = job["tierLabel"]
            propoaslTier = job["proposalsTier"]

            prefFreelancerLocation = job["prefFreelancerLocation"]
            if len(prefFreelancerLocation) == 0:
                prefFreelancerLocationText = ""
            else:
                # NOTE(review): each pass overwrites the previous value, so only
                # the last location survives — looks like a bug upstream, kept.
                for freelancerLocation in prefFreelancerLocation:
                    prefFreelancerLocationText = freelancerLocation + ", "
                prefFreelancerLocationText = prefFreelancerLocationText[:-2]

            publishedOn = job["publishedOn"]
            skills_1 = job["attrs"]
            skillStr = ""
            for skill in skills_1:
                skillStr = skillStr + skill["prettyName"] + ", "
            limit = len(skillStr) - 2
            skillStr = skillStr[:limit]  # drop trailing ", "
            isLocal = job["isLocal"]
            locations = job["locations"]
            hourlyBudgetText = job["hourlyBudgetText"]

            # --- scrape the job detail page with the second driver ------------
            del d1.requests  # clear selenium-wire's captured request log
            second_query_url = "https://www.upwork.com/jobs/{}".format(
                ciphertext)
            d1.get(second_query_url)
            time.sleep(10)
            src = d1.page_source
            soup = BeautifulSoup(src, 'lxml')

            job_posting_status = soup.findAll(
                "li", {"data-qa": "client-job-posting-stats"})
            job_posted_accounts = 0
            job_hire_rate = 0
            if job_posting_status:
                job_posted_accounts = job_posting_status[0].find(
                    "strong", {"class": "primary"}).getText()
                job_posted_accounts = ''.join(
                    job_posted_accounts).strip() if job_posted_accounts else ""
                job_posted_accounts = int((job_posted_accounts.split(" "))[0])
                job_hire_rate = job_posting_status[0].find(
                    "div", {"class": "text-muted"}).getText()
                job_hire_rate = ''.join(
                    job_hire_rate).strip() if job_hire_rate else ""
                job_hire_rate = int(
                    (((job_hire_rate.split(","))[0]).split(" "))[0].replace(
                        "%", ""))

            try:
                avg_hourly_rate = soup.findAll(
                    "strong", {"data-qa": "client-hourly-rate"})[0].getText()
                avg_hourly_rate = ''.join(
                    avg_hourly_rate).strip() if avg_hourly_rate else ""
                avg_hourly_rate = float(
                    ((avg_hourly_rate.split(" "))[0].replace("/hr",
                                                             "")).replace(
                                                                 "$", ""))
            except Exception:
                # No hourly-rate element: client has no hiring history.
                avg_hourly_rate = str("New Client")

            try:
                hire_accounts = soup.findAll(
                    "div", {"data-qa": "client-hires"})[0].getText()
                hire_accounts = ''.join(
                    hire_accounts).strip() if hire_accounts else ""
                print("Hire_Accounts---------------------!!!!!!--> : ",
                      hire_accounts)
                hire_accounts = hire_accounts.split("\n")[0]
                print("Hire_Accounts---------------------!!!!!!--> : ",
                      hire_accounts)
            except Exception:
                hire_accounts = 0

            try:
                client_spent_hours = soup.findAll(
                    "div", {"data-qa": "client-hours"})[0].getText()
                client_spent_hours = ''.join(
                    client_spent_hours).strip() if client_spent_hours else ""
                client_spent_hours = int((client_spent_hours.split(" "))[0])
            except Exception:
                client_spent_hours = 0

            try:
                client_created_date = soup.findAll(
                    "li", {"data-qa": "client-contract-date"})[0].find(
                        "small", {"class": "text-muted"}).getText()
                client_created_date = ''.join(
                    client_created_date).strip() if client_created_date else ""
                client_created_date = client_created_date.replace(
                    "Member since ", "")
            except Exception:
                client_created_date = "Private job"

            print("Title--------------------> :", title)
            print("Job URL------------------> :", second_query_url)
            print("Duration-----------------> :", duration)
            print("Engagement---------------> :", engagement)
            print("ShortDuration------------> :", shortDuration)
            print("ShortEngagement----------> :", shortEngagement)
            print("Amount-------------------> :", amount)
            print("ClientPaymentVerifcation-> :", client_paymentverification)
            print("ClientCountry------------> :", cient_country)
            print("TotalSpent---------------> :", totalSpent)
            print("TotalReviews-------------> :", totalReviews)
            print("TotalFeedback------------> :", totalFeedback)
            print("FeedbackText-------------> :", feedbackText)
            print("TierText-----------------> :", tierText)
            print("PrefFreelancerLocation---> :", prefFreelancerLocationText)
            print("Job Posted Accounts------> :", job_posted_accounts)
            print("Job Hired Accounts-------> :", hire_accounts)
            print("Job Hire Rate------------> :", job_hire_rate)
            print("Average Hourly Rate------> :", avg_hourly_rate)
            print("Member Since-------------> :", client_created_date)

            insertdb = InsertDB()
            create_time = str(nowtime)
            # keep only HH:M of "YYYY-MM-DD HH:MM:SS.ffffff"
            create_time_time = (create_time.split(" "))[1]
            create_time_time = create_time_time[0:4]
            data_base = []

            # --- filter rules -------------------------------------------------
            level_status = False
            weekly_hours_status = False
            period_status = False
            for level in level_array:
                if level in tierText:
                    level_status = True
            try:
                for weekly_hour in weekly_hours_array:
                    print(weekly_hour, shortEngagement)
                    if weekly_hour in shortEngagement:
                        weekly_hours_status = True
                        print("weekly hours----------------------> True!!!!")
                        break
            except Exception:
                weekly_hours_status = False
            try:
                for period in period_array:
                    if period in shortDuration:
                        period_status = True
            except Exception:
                period_status = False
            print("Level Status------------------->", level_status)

            if avg_hourly_rate != "New Client":
                avg_hourly_rate_text = str(avg_hourly_rate) + "$/hr"
            else:
                avg_hourly_rate_text = " "

            # Decide whether this job passes the filters; the row itself is
            # identical in every branch, so build it once below.
            store = False
            if avg_hourly_rate != "New Client":
                print("This client isn't new person on upwork")
                if avg_hourly_rate >= average_rate:
                    print("평균 지불 hourly rate가 40 이상인 경우:")
                    if ("Intermediate" in tierText or "Expert" in tierText):
                        print("intermediate, expert 레벨의 과제는 과제 기간에 상관없이 현시한다")
                        store = True
                elif avg_hourly_rate < average_rate:
                    print("과거의 평균지불레이트 40미만이면 다음의 조건에 맞는 과제들만 현시한다.")
                    if (level_status and weekly_hours_status
                            and period_status) or amount >= fixed_amount:
                        print(
                            "Expert, 10 or 30hours more, 1 month more or 10k more"
                        )
                        store = True
            else:
                print(level_status, weekly_hours_status, period_status)
                if (level_status and weekly_hours_status
                        and period_status) or amount >= fixed_amount:
                    print(
                        "Expert, 10 or 30hours more, 1 month more or 10k more")
                    store = True

            if store:
                budget = "Hourly" if amount == 0 else amount
                data_base.append(
                    (title, second_query_url, budget, duration,
                     shortEngagement, client_paymentverification,
                     cient_country, totalSpent, feedbackText,
                     prefFreelancerLocationText, tierText[0:6],
                     job_posted_accounts, hire_accounts, avg_hourly_rate_text,
                     client_created_date, create_time_time))
                insertdb.insert_document(data_base)

            counts += 1

        if all_loops == 1:
            break
        refresh_status = refresh_status + 1
def main(htmlstring, driver):
    """Scrape Zillow map results for 'by owner' listings and store each
    (address, status, phone) row into the `maricopa` table.

    htmlstring -- unused (kept for signature compatibility with callers)
    driver     -- Selenium WebDriver used to open each property detail page
    """
    table_name = "maricopa"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'referer': 'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285004%22%2C%22mapBounds%22%3A%7B%22west%22%3A-112.10311127801512%2C%22east%22%3A-112.04002572198485%2C%22south%22%3A33.42091247402758%2C%22north%22%3A33.48063826771274%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94720%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%7D&wants={%22cat1%22:[%22mapResults%22,%22total%22]}&requestId=2"
    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%2285006%22%2C%22mapBounds%22%3A%7B%22west%22%3A-112.07973577801513%2C%22east%22%3A-112.01665022198486%2C%22south%22%3A33.43522122804251%2C%22north%22%3A33.494937169247095%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94722%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%7D&wants={%22cat1%22:[%22listResults%22,%22mapResults%22,%22total%22]}&requestId=3"
    counts = 1
    for page in range(1, 4):
        if page == 1:
            url = first_url
        else:
            # BUGFIX: the original called default_url.format(page), which
            # raises at runtime because the URL contains literal braces in
            # the `wants={...}` query parameter (str.format tries to parse
            # them as replacement fields) and has no {} placeholder anyway —
            # the page number was hard-coded to 2. Substitute the page number
            # into the encoded pagination token instead.
            url = default_url.replace("currentPage%22%3A2",
                                      "currentPage%22%3A" + str(page))
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["cat1"]["searchResults"]["mapResults"]
        print(len(properties_infos))
        for i in range(0, len(properties_infos)):
            data_base = []
            property_url = "https://www.zillow.com" + properties_infos[i][
                "detailUrl"]
            status_text = properties_infos[i]["statusText"]
            print(status_text, counts)
            counts += 1
            # hdpData/homeInfo fields are optional in the API response.
            try:
                street_add = properties_infos[i]["hdpData"]["homeInfo"][
                    "streetAddress"]
            except Exception:
                street_add = ""
            try:
                city = properties_infos[i]["hdpData"]["homeInfo"]["city"]
            except Exception:
                city = ""
            try:
                state = properties_infos[i]["hdpData"]["homeInfo"]["state"]
            except Exception:
                state = ""
            try:
                zipcode = properties_infos[i]["hdpData"]["homeInfo"]["zipcode"]
            except Exception:
                zipcode = ""
            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      i + 1)
                driver.get(property_url)
                time.sleep(10)  # let the detail page finish rendering
                # Prefer the on-page address; it overrides the API value.
                street_add = driver.find_element_by_xpath(
                    "//h1[@class='ds-address-container']/span[1]").text
                property_address = street_add + ", " + city + ", " + state + " " + zipcode
                # Extract every (xxx) xxx-xxxx phone number from the page.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                for phone_number in phones:
                    print("Property Address--------------------> : ",
                          property_address)
                    print("Property Url------------------------> : ",
                          property_url)
                    print("Property Status---------------------> : ",
                          status_text)
                    print("Owner Phone Number------------------> : ",
                          phone_number)
                    # Dedup key: md5 over address + status + phone.
                    string_id = property_address + status_text + phone_number
                    m = hashlib.md5()
                    m.update(string_id.encode('utf8'))
                    identifier = m.hexdigest()
                    print("hash-------------------->", identifier)
                    create_time = str(datetime.datetime.now())
                    update_time = ""
                    insertdb = InsertDB()
                    data_base.append((property_address, street_add, city,
                                      state, zipcode, status_text,
                                      phone_number, identifier, create_time,
                                      update_time))
                    insertdb.insert_document(data_base, table_name)
def parse_google(htmlstring, driver, f):
    """Cross-check flashscore NCAA results against Google's live-score widget.

    Reads games from ncaa_flashscore.json, searches "<home> v <away>" on
    Google via the already-open driver, compares team names/scores, and
    writes each verdict both to the DB (`ncaa` table) and as JSON to `f`.

    htmlstring -- unused (kept for signature compatibility)
    driver     -- Selenium WebDriver already on a Google search page
    f          -- open file handle receiving comma-separated JSON objects
    """
    table_name = "ncaa"
    print("///---------Google Search--------------///")
    with open('ncaa_flashscore.json') as json_file:
        items = json.load(json_file)
        counts = len(items)
        count = 1
        for item in items:
            data_base = []
            event_time = item['event-time']
            home_team = item['home-name']
            home_score = item['home-score']
            away_team = item['away-name']
            away_score = item['away-score']
            print(home_team, " v ", away_team, "-------------", count)
            search_key = home_team + " v " + away_team
            search_google = driver.find_element_by_xpath(
                "//input[contains(@class, 'gLFyf') and contains(@class, 'gsfi')]"
            )
            search_google.send_keys(search_key)
            search_google.send_keys(Keys.ENTER)
            time.sleep(5)  # wait for the results page
            # Google renders the score box in two different layouts; try the
            # immersive one first, then the match-grid fallback.
            try:
                teamNames = driver.find_elements_by_xpath(
                    "//div[contains(@class, 'liveresults-sports-immersive__team-name-width')]"
                )
                fstName = teamNames[0].text
                sndName = teamNames[1].text
                fstScore = driver.find_element_by_xpath(
                    "//div[contains(@class, 'imso_mh__l-tm-sc')]").text
                sndScore = driver.find_element_by_xpath(
                    "//div[contains(@class, 'imso_mh__r-tm-sc')]").text
                print(fstName, "<----->", fstScore)
                print(sndName, "<----->", sndScore)
                # Teams may appear in either order on Google's side.
                if (home_team in fstName and home_score == fstScore) and (
                        away_team in sndName and away_score == sndScore):
                    googleMatch = "True"
                elif (away_team in fstName and away_score == fstScore) and (
                        home_team in sndName and home_score == sndScore):
                    googleMatch = "True"
                else:
                    googleMatch = "False"
            except Exception:
                try:
                    teamNames = driver.find_elements_by_xpath(
                        "//td[contains(@class, 'liveresults-sports-immersive__match-grid-right-border')]//div[contains(@class, 'ellipsisize') and contains(@class, 'kno-fb-ctx')]/span"
                    )
                    fstName = teamNames[0].text
                    sndName = teamNames[1].text
                    teamScores = driver.find_elements_by_xpath(
                        "//td[contains(@class, 'liveresults-sports-immersive__match-grid-right-border')]//div[@class='imspo_mt__tt-w']"
                    )
                    fstScore = teamScores[0].text
                    sndScore = teamScores[1].text
                    print(fstName, "<----->", fstScore)
                    print(sndName, "<----->", sndScore)
                    if (fstName in home_team and home_score == fstScore) and (
                            sndName in away_team and away_score == sndScore):
                        googleMatch = "True"
                    elif (fstName in away_team
                          and away_score == fstScore) and (
                              sndName in home_team
                              and home_score == sndScore):
                        googleMatch = "True"
                    else:
                        googleMatch = "False"
                except Exception:
                    googleMatch = "False"
            try:
                gameStatus = driver.find_element_by_xpath(
                    "//span[contains(@class, 'imso_mh__ft-mtch') and contains(@class, 'imso-medium-font')]"
                ).text
            except Exception:
                gameStatus = "Not"
            if "final" in gameStatus.lower():
                game_status = "Final"
            else:
                game_status = "Future"
            create_time = str(datetime.datetime.now())
            update_time = ""
            print("---------------------------?????",
                  event_time + home_team + away_team)
            # Dedup key: md5 over event time + both team names.
            string_id = event_time + home_team + away_team
            m = hashlib.md5()
            m.update(string_id.encode('utf8'))
            identifier = m.hexdigest()
            print("hash-------------------->", identifier)
            insertdb = InsertDB()
            data_base.append((event_time, home_team, home_score, away_team,
                              away_score, googleMatch, game_status,
                              identifier, create_time, update_time))
            insertdb.insert_document(data_base, table_name)
            info = {
                "event-time": event_time,
                "home-name": home_team,
                "home-score": home_score,
                "away-name": away_team,
                "away-score": away_score,
                "google-matching": googleMatch,
                "game-status": game_status,
                # NOTE(review): "indentifier" typo is preserved — downstream
                # consumers of this JSON may depend on the misspelled key.
                "indentifier": identifier,
                "create-time": create_time,
                "update-time": update_time
            }
            json.dump(info, f)
            if count != counts:
                f.write(',\n')  # comma-separate all but the last record
            search_google1 = driver.find_element_by_xpath(
                "//input[contains(@class, 'gLFyf') and contains(@class, 'gsfi')]"
            )
            search_google1.clear()
            count += 1
    driver.close()
    driver.quit()
def main(htmlstring, driver):
    """Scrape Zillow list results (Maricopa County, price-filtered) for
    'by owner' listings, collect phone numbers and home facts from each
    detail page, and store rows in the `maricopa_30_08_2020` table.

    htmlstring -- unused (kept for signature compatibility)
    driver     -- Selenium WebDriver used for detail pages / captcha detection
    """
    table_name = "maricopa_30_08_2020"
    solver = CaptchaSolver()  # kept: constructor may have side effects; not otherwise used here
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'cookie':
        'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Search-query parameters, concatenated as strings into the request URL.
    pagination = ""
    usersSearchTerm = "Maricopa County, AZ"
    west = "-114.00266022265627"
    east = "-110.50900787890627"
    south = "32.012669442967976"
    north = "35.813297084142235"
    regionId = "2402"
    regionType = "4"
    mapZoom = "8"
    includeList = "true"
    priceMax = "220000"
    priceMin = "210000"
    monthlyPayMax = "784"
    monthlyPayMin = "648"
    # Page-1 URL uses an empty "pagination":{} object; later pages insert
    # "currentPage" (built inside the loop).  The previously dead
    # default_first_url template and the pre-loop default_page_url (which was
    # always overwritten inside the loop) have been removed.
    first_case_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{' + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + ',"min":' + priceMin + '},"monthlyPayment":{"max":' + monthlyPayMax + ',"min":' + monthlyPayMin + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList
    counts = 1
    for page in range(1, 9):
        default_page_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":' + str(
            page
        ) + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + ',"min":' + priceMin + '},"monthlyPayment":{"max":' + monthlyPayMax + ',"min":' + monthlyPayMin + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList
        if page == 1:
            url = first_case_url
        else:
            url = default_page_url
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i in range(0, len(properties_infos)):
            data_base = []
            property_url = properties_infos[i]["detailUrl"]
            status_text = properties_infos[i]["statusText"]
            print(status_text, counts)
            counts += 1
            # hdpData/homeInfo fields are optional in the API response.
            try:
                street_add = properties_infos[i]["hdpData"]["homeInfo"][
                    "streetAddress"]
            except Exception:
                street_add = ""
            try:
                city = properties_infos[i]["hdpData"]["homeInfo"]["city"]
            except Exception:
                city = ""
            try:
                state = properties_infos[i]["hdpData"]["homeInfo"]["state"]
            except Exception:
                state = ""
            try:
                zipcode = properties_infos[i]["hdpData"]["homeInfo"]["zipcode"]
            except Exception:
                zipcode = ""
            property_address = street_add + ", " + city + ", " + state + " " + zipcode
            try:
                bathrooms = properties_infos[i]["hdpData"]["homeInfo"][
                    "bathrooms"]
            except Exception:
                bathrooms = ""
            try:
                bedrooms = properties_infos[i]["hdpData"]["homeInfo"][
                    "bedrooms"]
            except Exception:
                bedrooms = ""
            try:
                tax_assessed_value = properties_infos[i]["hdpData"][
                    "homeInfo"]["taxAssessedValue"]
            except Exception:
                tax_assessed_value = ""
            try:
                zestimate = properties_infos[i]["hdpData"]["homeInfo"][
                    "zestimate"]
            except Exception:
                zestimate = ""
            try:
                rent_zestimate = properties_infos[i]["hdpData"]["homeInfo"][
                    "rentZestimate"]
            except Exception:
                rent_zestimate = ""
            try:
                home_type = properties_infos[i]["hdpData"]["homeInfo"][
                    "homeType"]
            except Exception:
                home_type = ""
            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      i + 1)
                driver.get(property_url)
                time.sleep(10)  # let the detail page finish rendering
                # If Zillow serves a reCAPTCHA, just wait it out (a disabled
                # anti-captcha integration was removed from here; re-enable a
                # proper solver if this proves insufficient).
                try:
                    recaptcha = driver.find_element_by_class_name(
                        "g-recaptcha")
                    recaptchaFlag = True
                except Exception:
                    recaptchaFlag = False
                if recaptchaFlag:
                    time.sleep(20)
                # Extract every (xxx) xxx-xxxx phone number from the page.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                for phone_number in phones:
                    # Home-facts list: labels and values are parallel spans.
                    features_labels = driver.find_elements_by_xpath(
                        "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
                    )
                    features_infos = driver.find_elements_by_xpath(
                        "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
                    )
                    parking = ""
                    year_built = ""
                    hoa = ""
                    heating = ""
                    lot = ""
                    cooling = ""
                    price_sqft = ""
                    for feature_label, feature_info in zip(
                            features_labels, features_infos):
                        feature_label_txt = feature_label.text
                        if 'Parking' in feature_label_txt:
                            parking = feature_info.text
                        elif 'Year built' in feature_label_txt:
                            year_built = feature_info.text
                        elif 'HOA' in feature_label_txt:
                            hoa = feature_info.text
                        elif 'Heating' in feature_label_txt:
                            heating = feature_info.text
                        elif 'Lot' in feature_label_txt:
                            lot = feature_info.text
                        elif 'Cooling' in feature_label_txt:
                            cooling = feature_info.text
                        elif 'Price/' in feature_label_txt:
                            price_sqft = feature_info.text
                    print("Property Address--------------------> : ",
                          property_address)
                    print("Property Url------------------------> : ",
                          property_url)
                    print("Property Status---------------------> : ",
                          status_text)
                    print("Owner Phone Number------------------> : ",
                          phone_number)
                    print("BathRooms---------------------------> : ",
                          bathrooms)
                    print("BedRooms----------------------------> : ",
                          bedrooms)
                    print("Tax Assessed Value------------------> : ",
                          tax_assessed_value)
                    print("Zestimate---------------------------> : ",
                          zestimate)
                    print("Rent Zestimate----------------------> : ",
                          rent_zestimate)
                    print("Home Type---------------------------> : ",
                          home_type)
                    print("Parking-----------------------------> : ", parking)
                    print("Year Built--------------------------> : ",
                          year_built)
                    print("HOA---------------------------------> : ", hoa)
                    print("Heating-----------------------------> : ", heating)
                    print("Lot---------------------------------> : ", lot)
                    print("Cooling-----------------------------> : ", cooling)
                    print("Price Sqft--------------------------> : ",
                          price_sqft)
                    # Dedup key: md5 over address + status + phone.
                    string_id = property_address + status_text + phone_number
                    m = hashlib.md5()
                    m.update(string_id.encode('utf8'))
                    identifier = m.hexdigest()
                    print("hash-------------------->", identifier)
                    # BUGFIX: was str(datetime.now()) — every sibling function
                    # in this file uses datetime.datetime.now() (module-style
                    # import), so the original raised AttributeError here.
                    create_time = str(datetime.datetime.now())
                    update_time = ""
                    insertdb = InsertDB()
                    data_base.append(
                        (property_address, street_add, city, state, zipcode,
                         status_text, phone_number, bathrooms, bedrooms,
                         tax_assessed_value, zestimate, rent_zestimate,
                         home_type, parking, year_built, hoa, heating, lot,
                         cooling, price_sqft, identifier, create_time,
                         update_time))
                    insertdb.insert_document(data_base, table_name)
def main(htmlstring, driver):
    """Scrape Zillow map results for Maricopa County, find 'by owner'
    listings, and record (address, status, phone) both to a CSV file and
    to the `maricopa` DB table.

    htmlstring -- unused (kept for signature compatibility)
    driver     -- Selenium WebDriver used to open each property detail page
    """
    table_name = "maricopa"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'referer': 'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-113.68680328906252%2C%22east%22%3A-110.68753571093752%2C%22south%22%3A31.344189534984903%2C%22north%22%3A35.173746183166216%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A2402%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A8%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D"
    response = requests.get(url, headers=header)
    result = response.json()
    properties_infos = result["searchResults"]["mapResults"]
    print(len(properties_infos))
    for i in range(0, len(properties_infos)):
        data_base = []
        property_url = "https://www.zillow.com/" + properties_infos[i][
            "detailUrl"]
        status_text = properties_infos[i]["statusText"]
        # hdpData/homeInfo fields are optional in the API response.
        try:
            street_add = properties_infos[i]["hdpData"]["homeInfo"][
                "streetAddress"]
        except Exception:
            street_add = ""
        try:
            city = properties_infos[i]["hdpData"]["homeInfo"]["city"]
        except Exception:
            city = ""
        try:
            state = properties_infos[i]["hdpData"]["homeInfo"]["state"]
        except Exception:
            state = ""
        try:
            zipcode = properties_infos[i]["hdpData"]["homeInfo"]["zipcode"]
        except Exception:
            zipcode = ""
        property_address = street_add + ", " + city + ", " + state + " " + zipcode
        if "by owner" in status_text:
            print("--------------------------------------------------> : ",
                  i + 1)
            print("Property Address--------------------> : ",
                  property_address)
            print("Property Url------------------------> : ", property_url)
            print("Property Status---------------------> : ", status_text)
            driver.get(property_url)
            time.sleep(10)  # let the detail page finish rendering
            # Extract every (xxx) xxx-xxxx phone number from the page.
            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            for phone_number in phones:
                print("Owner Phone Number------------------> : ",
                      phone_number)
                with open("Zillow_Maricopan_AZ_only500.csv",
                          "a",
                          newline="",
                          encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(
                        [property_address, status_text, phone_number])
                # Dedup key: md5 over address + status + phone.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""
                insertdb = InsertDB()
                data_base.append(
                    (property_address, street_add, city, state, zipcode,
                     status_text, phone_number, identifier, create_time,
                     update_time))
                insertdb.insert_document(data_base, table_name)
def main(htmlstring, driver):
    """Scrape paginated Zillow list results for the 85006 search region and
    record "for sale by owner" (FSBO) listings.

    Builds GetSearchPageState URLs (page 1 uses an empty pagination object,
    later pages set "currentPage"), walks pages 1-3, and for every listing
    whose status text contains "by owner": loads the detail page in the
    Selenium driver, regex-scrapes US phone numbers from the page source,
    and inserts one row per phone into the "maricopa_30_08_2020" table via
    InsertDB.

    Args:
        htmlstring: unused; kept so existing call sites keep working.
        driver: Selenium WebDriver used to load listing detail pages.
    """
    table_name = "maricopa_30_08_2020"
    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        # Session cookie captured from a browser; required for the search
        # endpoint to answer. NOTE(review): this will expire — refresh it
        # when requests start failing.
        'cookie': 'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # searchQueryState parameters (strings, spliced directly into the URL).
    pagination = ""
    usersSearchTerm = "85006"
    west = "-112.07973577801513"
    east = "-112.01665022198486"
    south = "33.43522122804253"
    north = "33.494937169247144"
    regionId = "94722"
    regionType = "7"
    mapZoom = "14"
    includeList = "true"
    # Page-1 URL: "pagination" is an empty JSON object.
    first_case_url = (
        'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{'
        + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm +
        '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' +
        south + ',"north":' + north + '},"regionSelection":[{"regionId":' +
        regionId + ',"regionType":' + regionType +
        '}],"isMapVisible":true,"mapZoom":' + mapZoom +
        ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList='
        + includeList)
    print(first_case_url)
    counts = 1
    for page in range(1, 4):
        if page == 1:
            url = first_case_url
        else:
            # Later pages set "currentPage" inside the pagination object.
            url = (
                'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":'
                + str(page) + '},' + '"usersSearchTerm":"' + usersSearchTerm +
                '","mapBounds":{"west":' + west + ',"east":' + east +
                ',"south":' + south + ',"north":' + north +
                '},"regionSelection":[{"regionId":' + regionId +
                ',"regionType":' + regionType +
                '}],"isMapVisible":true,"mapZoom":' + mapZoom +
                ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList='
                + includeList)
        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))
        for i, prop in enumerate(properties_infos):
            property_url = prop["detailUrl"]
            status_text = prop["statusText"]
            print(status_text, counts)
            counts += 1
            # hdpData/homeInfo is absent on some results; fall back to "" per
            # field (replaces four bare try/except blocks that hid errors).
            home_info = (prop.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")
            property_address = street_add + ", " + city + ", " + state + " " + zipcode
            if "by owner" in status_text:
                print(
                    "--------------------------------------------------> : ",
                    i + 1)
                driver.get(property_url)
                time.sleep(10)  # give the listing page's JS time to render
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                for phone_number in phones:
                    print("Property Address--------------------> : ",
                          property_address)
                    print("Property Url------------------------> : ",
                          property_url)
                    print("Property Status---------------------> : ",
                          status_text)
                    print("Owner Phone Number------------------> : ",
                          phone_number)
                    # Dedup identifier: md5 over address + status + phone.
                    string_id = property_address + status_text + phone_number
                    m = hashlib.md5()
                    m.update(string_id.encode('utf8'))
                    identifier = m.hexdigest()
                    print("hash-------------------->", identifier)
                    create_time = str(datetime.datetime.now())
                    update_time = ""
                    insertdb = InsertDB()
                    # Insert exactly this phone's row. The original appended
                    # to a per-listing list and re-sent the whole list each
                    # phone iteration, re-inserting earlier rows.
                    data_base = [(property_address, street_add, city, state,
                                  zipcode, status_text, phone_number,
                                  identifier, create_time, update_time)]
                    insertdb.insert_document(data_base, table_name)