def parse_details(htmlstring, driver1):
    """Scrape brand/title/ratings details from the page currently loaded in
    *driver1* and insert one row into the database.

    Parameters:
        htmlstring: unused (kept for caller compatibility).
        driver1: Selenium WebDriver with the detail page already loaded.

    All scraping errors are caught and logged; the function never raises.
    """
    data_base = []

    try:
        brand_name = driver1.find_element_by_xpath(
            "//h1[contains(@class, 'sc-csuQGl') and contains(@class, 'icyJZl')]"
        ).text
        title = driver1.find_element_by_xpath(
            "//h4[contains(@class, 'sc-gipzik')]").text
        # The three spans appear in a fixed order: planet, people, animals.
        xpath_infos = driver1.find_elements_by_xpath(
            "//span[contains(@class, 'StyledText-sc-1sadyjn-0') and contains(@class, 'bVvIwM')]"
        )

        planet = xpath_infos[0].text
        people = xpath_infos[1].text
        animals = xpath_infos[2].text

        description = driver1.find_element_by_xpath(
            "//div[contains(@class, 'sc-hzDkRC') and contains(@class, 'giRCDN')]"
        ).text

        overall_rating = driver1.find_element_by_xpath(
            "//h2[contains(@class,  'sc-bRBYWo')]").text
        overall_rating = overall_rating.replace("Overall rating: ", "")

        print("Brand-------------> : ", brand_name)
        print("Title-------------> : ", title)
        print("Planet------------> : ", planet)
        print("People------------> : ", people)
        print("Animals-----------> : ", animals)
        print("Overall Rating----> : ", overall_rating)
        print("Description-------> : ", description)

        # MD5 over the concatenated fields is used as a dedup identifier.
        string_identify = brand_name + title + planet + people + animals + overall_rating
        m = hashlib.md5()
        m.update(string_identify.encode('utf8'))
        identifier = m.hexdigest()

        create_time = str(datetime.datetime.now())
        update_time = ""

        insertdb = InsertDB()
        data_base.append(
            (brand_name, title, planet, people, animals, overall_rating,
             description, identifier, create_time, update_time))

        # NOTE(review): table_name is not defined in this function's scope —
        # presumably a module-level global; confirm before relying on this path.
        insertdb.insert_document(data_base, table_name)
    except Exception as exc:
        # Best-effort scraping: skip the item, but log what went wrong
        # instead of silently swallowing everything (was a bare except).
        print("Continue", exc)
def main(htmlstring, driver):
    """Crawl Zillow search results (ZIP 85233, pages 1-2) and store every
    "for sale by owner" listing — address plus owner phone number — in the
    `maricopa` table.

    Parameters:
        htmlstring: unused (kept for caller compatibility).
        driver: Selenium WebDriver used to load each listing's detail page.
    """
    table_name = "maricopa"

    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'referer':
        'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Page 1 uses an empty "pagination" object; later pages embed currentPage.
    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285233%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.87922703222657%2C%22east%22%3A-111.75151096777344%2C%22south%22%3A33.286508479539734%2C%22north%22%3A33.40610372761989%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94827%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"

    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A{}%7D%2C%22usersSearchTerm%22%3A%2285233%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.87922703222657%2C%22east%22%3A-111.75151096777344%2C%22south%22%3A33.286508479539734%2C%22north%22%3A33.40610372761989%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94827%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"

    counts = 1

    for page in range(1, 3):
        url = first_url if page == 1 else default_url.format(page)

        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))

        for index, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            # hdpData/homeInfo may be missing or None for some listings;
            # fall back to empty strings (same effect as the old bare excepts).
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")

            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      index + 1)
                print("Property Address--------------------> : ",
                      property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)

                driver.get(property_url)
                time.sleep(10)  # let the detail page (and its JS) finish loading

                # Match phone numbers of the form "(480) 555-1234" anywhere
                # in the raw page source.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                # Bug fix: the original loop left phone_number undefined
                # (NameError) when no phone was found; keep "last match wins".
                phone_number = phones[-1] if phones else ""
                print("Owner Phone Number------------------> : ", phone_number)

                # MD5 over address+status+phone serves as a dedup identifier.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""

                insertdb = InsertDB()
                data_base.append(
                    (property_address, street_add, city, state, zipcode,
                     status_text, phone_number, identifier, create_time,
                     update_time))
                insertdb.insert_document(data_base, table_name)
# ---------- Example #3 (scrape-artifact separator, commented out) ----------
def main(htmlstring, driver):
    """Crawl Zillow search results (ZIP 85298, pages 1-4) and store every
    "for sale by owner" listing — address, owner phone, valuation figures and
    home facts — in the `maricopa_30_08_2020` table.

    Parameters:
        htmlstring: unused (kept for caller compatibility).
        driver: Selenium WebDriver used to load each listing's detail page.
    """
    table_name = "maricopa_30_08_2020"

    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'cookie':
        'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Page 1 uses an empty "pagination" object; later pages embed currentPage.
    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285298%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.79299953222655%2C%22east%22%3A-111.66528346777342%2C%22south%22%3A33.17769548749086%2C%22north%22%3A33.29743987631964%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A399659%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"

    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A{}%7D%2C%22usersSearchTerm%22%3A%2285298%22%2C%22mapBounds%22%3A%7B%22west%22%3A-111.79299953222655%2C%22east%22%3A-111.66528346777342%2C%22south%22%3A33.17769548749086%2C%22north%22%3A33.29743987631964%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A399659%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A13%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&includeMap=false&includeList=true"

    counts = 1

    for page in range(1, 5):
        url = first_url if page == 1 else default_url.format(page)

        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))

        for index, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            # hdpData/homeInfo may be missing or None for some listings;
            # fall back to empty strings (same effect as the old bare excepts).
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")

            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            bathrooms = home_info.get("bathrooms", "")
            bedrooms = home_info.get("bedrooms", "")
            tax_assessed_value = home_info.get("taxAssessedValue", "")
            zestimate = home_info.get("zestimate", "")
            rent_zestimate = home_info.get("rentZestimate", "")
            home_type = home_info.get("homeType", "")

            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      index + 1)

                driver.get(property_url)
                time.sleep(10)  # let the detail page (and its JS) finish loading

                # wait() is an external helper that blocks until the facts
                # list appears; proceed anyway if it never shows up.
                try:
                    wait(driver, "//ul[@class='ds-home-fact-list']")
                except Exception:
                    print("There is no xpath")

                # Match phone numbers of the form "(480) 555-1234" anywhere
                # in the raw page source.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                # Bug fix: the original loop left phone_number undefined
                # (NameError) when no phone was found; keep "last match wins".
                phone_number = phones[-1] if phones else ""

                # Home-fact labels and values are parallel span lists.
                features_labels = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
                )
                features_infos = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
                )

                parking = ""
                year_built = ""
                hoa = ""
                heating = ""
                lot = ""
                cooling = ""
                price_sqft = ""

                for feature_label, feature_info in zip(features_labels,
                                                       features_infos):
                    feature_label_txt = feature_label.text

                    if 'Parking' in feature_label_txt:
                        parking = feature_info.text
                    elif 'Year built' in feature_label_txt:
                        year_built = feature_info.text
                    elif 'HOA' in feature_label_txt:
                        hoa = feature_info.text
                    elif 'Heating' in feature_label_txt:
                        heating = feature_info.text
                    elif 'Lot' in feature_label_txt:
                        lot = feature_info.text
                    elif 'Cooling' in feature_label_txt:
                        cooling = feature_info.text
                    elif 'Price/' in feature_label_txt:
                        price_sqft = feature_info.text

                print("Property Address--------------------> : ",
                      property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)
                print("BathRooms---------------------------> : ", bathrooms)
                print("BedRooms----------------------------> : ", bedrooms)
                print("Tax Assessed Value------------------> : ",
                      tax_assessed_value)
                print("Zestimate---------------------------> : ", zestimate)
                print("Rent Zestimate----------------------> : ",
                      rent_zestimate)
                print("Home Type---------------------------> : ", home_type)
                print("Parking-----------------------------> : ", parking)
                print("Year Built--------------------------> : ", year_built)
                print("HOA---------------------------------> : ", hoa)
                print("Heating-----------------------------> : ", heating)
                print("Lot---------------------------------> : ", lot)
                print("Cooling-----------------------------> : ", cooling)
                print("Price Sqft--------------------------> : ", price_sqft)

                # MD5 over address+status+phone serves as a dedup identifier.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""

                insertdb = InsertDB()
                data_base.append(
                    (property_address, street_add, city, state, zipcode,
                     status_text, phone_number, bathrooms, bedrooms,
                     tax_assessed_value, zestimate, rent_zestimate, home_type,
                     parking, year_built, hoa, heating, lot, cooling,
                     price_sqft, identifier, create_time, update_time))
                insertdb.insert_document(data_base, table_name)
# ---------- Example #4 (scrape-artifact separator, commented out) ----------
def main(htmlstring, driver):
    """Crawl Zillow search results (ZIP 85018, pages 1-6) and store every
    "for sale by owner" listing — address plus owner phone number — in the
    `maricopa` table.

    Parameters:
        htmlstring: unused (kept for caller compatibility).
        driver: Selenium WebDriver used to load each listing's detail page.
    """
    table_name = "maricopa"

    header = {
                'accept': '*/*',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
                'referer': 'https://www.zillow.com/homes/85139_rb/',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
            }

    # Search-query parameters spliced directly into the JSON query string.
    pagination = ""
    usersSearchTerm = "85018"
    west = "-112.03634614855957"
    east = "-111.92699785144043"
    south = "33.45540259905828"
    north = "33.574765505875604"
    regionId = "94734"
    regionType = "7"
    mapZoom = "13"
    includeList = "true"

    # Page 1 uses an empty "pagination" object; later pages embed currentPage
    # (built inside the loop below).
    first_case_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{' + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList=' + includeList

    print(first_case_url)

    counts = 1

    for page in range(1, 7):

        if page == 1:
            url = first_case_url
        else:
            url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":' + str(page) + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"}},"isListVisible":true}&includeMap=false&includeList=' + includeList

        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))

        for index, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            # hdpData/homeInfo may be missing or None for some listings;
            # fall back to empty strings (same effect as the old bare excepts).
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")

            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            if "by owner" in status_text:
                print("--------------------------------------------------> : ", index + 1)

                driver.get(property_url)
                time.sleep(10)  # let the detail page (and its JS) finish loading

                # Match phone numbers of the form "(480) 555-1234" anywhere
                # in the raw page source.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}', driver.page_source)
                # Bug fix: the original loop left phone_number undefined
                # (NameError) when no phone was found; keep "last match wins".
                phone_number = phones[-1] if phones else ""

                print("Property Address--------------------> : ", property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)

                # MD5 over address+status+phone serves as a dedup identifier.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""

                insertdb = InsertDB()
                data_base.append((property_address, street_add, city, state, zipcode, status_text, phone_number, identifier, create_time, update_time))
                insertdb.insert_document(data_base, table_name)
# ---------- Example #5 (scrape-artifact separator, commented out) ----------
def main(htmlstring, driver):
    """Crawl Zillow search results (Maricopa County, AZ, pages 1-14, filtered
    by max price / max monthly payment) and store every "for sale by owner"
    listing — address, owner phone, valuation figures and home facts — in the
    `maricopa_30_08_2020` table.

    Parameters:
        htmlstring: unused (kept for caller compatibility).
        driver: Selenium WebDriver used to load each listing's detail page.
    """
    table_name = "maricopa_30_08_2020"

    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Search-query parameters spliced directly into the JSON query string.
    pagination = ""
    usersSearchTerm = "Maricopa County, AZ"
    west = "-112.42886871875002"
    east = "-110.68204254687502"
    south = "33.142087182945815"
    north = "35.173746183166216"
    regionId = "2402"
    regionType = "4"
    mapZoom = "9"
    includeList = "true"
    priceMax = "1000"
    monthlyPay = "3"

    # Page 1 uses an empty "pagination" object; later pages embed currentPage
    # (built inside the loop below).
    first_case_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{' + pagination + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + '},"monthlyPayment":{"max":' + monthlyPay + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList

    print(first_case_url)

    counts = 1

    for page in range(1, 15):

        if page == 1:
            url = first_case_url
        else:
            url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":' + str(
                page
            ) + '},' + '"usersSearchTerm":"' + usersSearchTerm + '","mapBounds":{"west":' + west + ',"east":' + east + ',"south":' + south + ',"north":' + north + '},"regionSelection":[{"regionId":' + regionId + ',"regionType":' + regionType + '}],"isMapVisible":true,"mapZoom":' + mapZoom + ',"filterState":{"sort":{"value":"globalrelevanceex"},"price":{"max":' + priceMax + '},"monthlyPayment":{"max":' + monthlyPay + '}},"isListVisible":true}&includeMap=false&includeList=' + includeList

        print(url)
        response = requests.get(url, headers=header)
        result = response.json()
        print(result)

        properties_infos = result["searchResults"]["listResults"]
        print(len(properties_infos))

        for index, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            # hdpData/homeInfo may be missing or None for some listings;
            # fall back to empty strings (same effect as the old bare excepts).
            home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
            street_add = home_info.get("streetAddress", "")
            city = home_info.get("city", "")
            state = home_info.get("state", "")
            zipcode = home_info.get("zipcode", "")

            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            bathrooms = home_info.get("bathrooms", "")
            bedrooms = home_info.get("bedrooms", "")
            tax_assessed_value = home_info.get("taxAssessedValue", "")
            zestimate = home_info.get("zestimate", "")
            rent_zestimate = home_info.get("rentZestimate", "")
            home_type = home_info.get("homeType", "")

            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      index + 1)

                driver.get(property_url)
                time.sleep(10)  # let the detail page (and its JS) finish loading

                # Match phone numbers of the form "(480) 555-1234" anywhere
                # in the raw page source.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                # Bug fix: the original loop left phone_number undefined
                # (NameError) when no phone was found; keep "last match wins".
                phone_number = phones[-1] if phones else ""

                # Home-fact labels and values are parallel span lists.
                features_labels = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
                )
                features_infos = driver.find_elements_by_xpath(
                    "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
                )

                # Bug fix: these were never initialized, so any fact label
                # missing from the page raised NameError at the prints below.
                parking = ""
                year_built = ""
                hoa = ""
                heating = ""
                lot = ""
                cooling = ""
                price_sqft = ""

                for feature_label, feature_info in zip(features_labels,
                                                       features_infos):
                    feature_label_txt = feature_label.text

                    if 'Parking' in feature_label_txt:
                        parking = feature_info.text
                    elif 'Year built' in feature_label_txt:
                        year_built = feature_info.text
                    elif 'HOA' in feature_label_txt:
                        hoa = feature_info.text
                    elif 'Heating' in feature_label_txt:
                        heating = feature_info.text
                    elif 'Lot' in feature_label_txt:
                        lot = feature_info.text
                    elif 'Cooling' in feature_label_txt:
                        cooling = feature_info.text
                    elif 'Price/' in feature_label_txt:
                        price_sqft = feature_info.text

                print("Property Address--------------------> : ",
                      property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)
                print("BathRooms---------------------------> : ", bathrooms)
                print("BedRooms----------------------------> : ", bedrooms)
                print("Tax Assessed Value------------------> : ",
                      tax_assessed_value)
                print("Zestimate---------------------------> : ", zestimate)
                print("Rent Zestimate----------------------> : ",
                      rent_zestimate)
                print("Home Type---------------------------> : ", home_type)
                print("Parking-----------------------------> : ", parking)
                print("Year Built--------------------------> : ", year_built)
                print("HOA---------------------------------> : ", hoa)
                print("Heating-----------------------------> : ", heating)
                print("Lot---------------------------------> : ", lot)
                print("Cooling-----------------------------> : ", cooling)
                print("Price Sqft--------------------------> : ", price_sqft)

                # MD5 over address+status+phone serves as a dedup identifier.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""

                insertdb = InsertDB()
                data_base.append(
                    (property_address, street_add, city, state, zipcode,
                     status_text, phone_number, bathrooms, bedrooms,
                     tax_assessed_value, zestimate, rent_zestimate, home_type,
                     parking, year_built, hoa, heating, lot, cooling,
                     price_sqft, identifier, create_time, update_time))
                insertdb.insert_document(data_base, table_name)
def monitor_jobs_crawler(d, d1):
    """Poll the Upwork job feed and persist jobs that pass the rate filters.

    For every new job in the feed, renders the job detail page with ``d1``,
    scrapes client statistics (jobs posted, hire rate, average hourly rate,
    hours billed, member-since date), then applies the level / weekly-hours /
    duration / budget rules and stores matching rows via ``InsertDB``.

    Args:
        d:  driver used for the JSON feed queries (``query_jobs`` helpers).
        d1: driver used to render individual job detail pages.

    Uses module globals ``title_val`` (first job title seen on the previous
    run, used to detect already-processed jobs), ``all_loops`` and
    ``refresh_status``, plus module-level filter settings ``level_array``,
    ``weekly_hours_array``, ``period_array``, ``average_rate`` and
    ``fixed_amount``.
    """

    global title_val, all_loops, refresh_status
    nowtime = datetime.now()
    nowtimeUTC = datetime.utcnow()

    nowtimestr = nowtime.strftime("%Y-%m-%d %H:%M:%S")
    nowtimeUTCstr = nowtimeUTC.strftime("%Y-%m-%d %H:%M:%S")

    print("NowTime---------------   -------> : ", nowtimestr)
    print("NowUTCTime-------------------> : ", nowtimeUTCstr)

    counts = 1
    moreBtn_click_accounts = 1

    result = query_jobs(d)
    if result is None:
        print('Error. Result is None')
        return

    # Pagination cursor for the "load more" feed endpoint.
    resultSetTs = result["paging"]["resultSetTs"]
    print("Next Paging ID--------------------->", resultSetTs)

    while True:

        # After the first full pass, fetch the next feed page every 10 jobs.
        if refresh_status != 0 and counts % 10 == 1:
            result = query_jobs_second(d, resultSetTs, moreBtn_click_accounts)
            if result is None:
                print("Error, Result is None")
                return
            moreBtn_click_accounts = moreBtn_click_accounts + 1

        for job in result["results"]:

            print(
                "------------------------------------------------------------->",
                counts)

            # Hard cap of jobs processed per run.
            if counts >= 50:
                return

            nowtime = datetime.now()
            title = job["title"]

            if counts == 1:
                # Feed head unchanged since the last run: nothing new.
                if title_val == title:
                    return
                first_title = title
                if all_loops == 1:
                    title_val = first_title
                    print("Store the first project Title: ", title_val)

            if counts != 1:
                # Reached the job we stopped at last run; remember the new
                # head of the feed and stop.
                if title_val == title:
                    print(title_val, title,
                          "---------------------> title is the same")
                    title_val = first_title

                    return

            print("First Job title------------------> ", first_title)
            print("Val Job title--------------------> ", title_val)

            createdOn = job["createdOn"]
            job_type = job["type"]  # renamed from `type` (shadowed builtin)
            ciphertext = job["ciphertext"]
            duration = job["duration"]
            shortDuration = job["shortDuration"]
            engagement = job["engagement"]
            shortEngagement = job["shortEngagement"]
            amount = int(job["amount"]["amount"])
            recno = job["recno"]
            uid = job["uid"]
            client_paymentverification = job["client"][
                "paymentVerificationStatus"]
            cient_country = job["client"]["location"]["country"]
            totalSpent = job["client"]["totalSpent"]
            totalReviews = job["client"]["totalReviews"]
            totalFeedback = job["client"]["totalFeedback"]
            lastContractPlatform = job["client"]["lastContractPlatform"]
            lastContractRid = job["client"]["lastContractRid"]
            lastContractTitle = job["client"]["lastContractTitle"]
            feedbackText = job["client"]["feedbackText"]
            try:
                # Keep only the first comma-separated fragment; the field
                # may be None, in which case fall back to "".
                feedbackText = (feedbackText.split(","))[0]
            except Exception:
                feedbackText = ""
            companyOrgUid = job["client"]["companyOrgUid"]

            freelancersToHire = job["freelancersToHire"]
            enterpriseJob = job["enterpriseJob"]
            tierText = job["tierText"]
            tier = job["tier"]
            tierLabel = job["tierLabel"]
            propoaslTier = job["proposalsTier"]

            # BUG FIX: the original loop overwrote the accumulator on every
            # iteration, keeping only the LAST preferred location; join all
            # entries instead (an empty list still yields "").
            prefFreelancerLocation = job["prefFreelancerLocation"]
            prefFreelancerLocationText = ", ".join(prefFreelancerLocation)

            publishedOn = job["publishedOn"]
            # Comma-separated pretty skill names.
            skills_1 = job["attrs"]
            skillStr = ", ".join(skill["prettyName"] for skill in skills_1)

            isLocal = job["isLocal"]
            locations = job["locations"]
            hourlyBudgetText = job["hourlyBudgetText"]

            # Drop previously captured requests, then render the detail page.
            del d1.requests
            second_query_url = "https://www.upwork.com/jobs/{}".format(
                ciphertext)
            d1.get(second_query_url)
            time.sleep(10)
            src = d1.page_source
            soup = BeautifulSoup(src, 'lxml')

            job_posting_status = soup.findAll(
                "li", {"data-qa": "client-job-posting-stats"})

            job_posted_accounts = 0
            job_hire_rate = 0
            if job_posting_status:
                # e.g. "12 jobs posted" -> 12
                job_posted_accounts = job_posting_status[0].find(
                    "strong", {
                        "class": "primary"
                    }).getText()
                job_posted_accounts = ''.join(
                    job_posted_accounts).strip() if job_posted_accounts else ""
                job_posted_accounts = int((job_posted_accounts.split(" "))[0])

                # e.g. "75% hire rate, 1 open job" -> 75
                job_hire_rate = job_posting_status[0].find(
                    "div", {
                        "class": "text-muted"
                    }).getText()
                job_hire_rate = ''.join(
                    job_hire_rate).strip() if job_hire_rate else ""
                job_hire_rate = int(
                    (((job_hire_rate.split(","))[0]).split(" "))[0].replace(
                        "%", ""))

            try:
                # e.g. "$25.00 /hr avg hourly rate paid" -> 25.0
                avg_hourly_rate = soup.findAll(
                    "strong", {"data-qa": "client-hourly-rate"})[0].getText()
                avg_hourly_rate = ''.join(
                    avg_hourly_rate).strip() if avg_hourly_rate else ""
                avg_hourly_rate = float(
                    ((avg_hourly_rate.split(" "))[0].replace("/hr",
                                                             "")).replace(
                                                                 "$", ""))
            except Exception:
                # Stat missing -> client has no payment history yet.
                avg_hourly_rate = str("New Client")

            try:
                hire_accounts = soup.findAll(
                    "div", {"data-qa": "client-hires"})[0].getText()

                hire_accounts = ''.join(
                    hire_accounts).strip() if hire_accounts else ""
                print("Hire_Accounts---------------------!!!!!!--> : ",
                      hire_accounts)
                hire_accounts = hire_accounts.split("\n")[0]
                print("Hire_Accounts---------------------!!!!!!--> : ",
                      hire_accounts)
            except Exception:
                hire_accounts = 0

            try:
                client_spent_hours = soup.findAll(
                    "div", {"data-qa": "client-hours"})[0].getText()
                client_spent_hours = ''.join(
                    client_spent_hours).strip() if client_spent_hours else ""
                client_spent_hours = int((client_spent_hours.split(" "))[0])
            except Exception:
                client_spent_hours = 0

            try:
                client_created_date = soup.findAll(
                    "li", {"data-qa": "client-contract-date"})[0].find(
                        "small", {
                            "class": "text-muted"
                        }).getText()
                client_created_date = ''.join(
                    client_created_date).strip() if client_created_date else ""
                client_created_date = client_created_date.replace(
                    "Member since ", "")
            except Exception:
                client_created_date = "Private job"

            print("Title--------------------> :", title)
            print("Job URL------------------> :", second_query_url)
            print("Duration-----------------> :", duration)
            print("Engagement---------------> :", engagement)
            print("ShortDuration------------> :", shortDuration)
            print("ShortEngagement----------> :", shortEngagement)
            print("Amount-------------------> :", amount)
            print("ClientPaymentVerifcation-> :", client_paymentverification)
            print("ClientCountry------------> :", cient_country)
            print("TotalSpent---------------> :", totalSpent)
            print("TotalReviews-------------> :", totalReviews)
            print("TotalFeedback------------> :", totalFeedback)
            print("FeedbackText-------------> :", feedbackText)
            print("TierText-----------------> :", tierText)
            print("PrefFreelancerLocation---> :", prefFreelancerLocationText)

            print("Job Posted Accounts------> :", job_posted_accounts)
            print("Job Hired Accounts-------> :", hire_accounts)
            print("Job Hire Rate------------> :", job_hire_rate)
            print("Average Hourly Rate------> :", avg_hourly_rate)
            print("Member Since-------------> :", client_created_date)

            insertdb = InsertDB()
            create_time = str(nowtime)
            # First 4 chars of the time component, i.e. "HH:M" (original
            # truncation kept; presumably meant as a short timestamp --
            # TODO confirm whether [0:5] ("HH:MM") was intended).
            create_time_time = (create_time.split(" "))[1]
            create_time_time = create_time_time[0:4]
            data_base = []

            level_status = False
            weekly_hours_status = False
            period_status = False

            for level in level_array:
                if level in tierText:
                    level_status = True
            try:
                for weekly_hour in weekly_hours_array:
                    print(weekly_hour, shortEngagement)
                    if weekly_hour in shortEngagement:
                        weekly_hours_status = True
                        print("weekly hours----------------------> True!!!!")
                        break
            except Exception:
                # shortEngagement may be None, making `in` raise.
                weekly_hours_status = False

            try:
                for period in period_array:
                    if period in shortDuration:
                        period_status = True
            except Exception:
                # shortDuration may be None, making `in` raise.
                period_status = False

            print("Level Status------------------->", level_status)
            if avg_hourly_rate != "New Client":
                avg_hourly_rate_text = str(avg_hourly_rate) + "$/hr"
            else:
                avg_hourly_rate_text = " "
            job_hire_rate_text = str(job_hire_rate) + "% hire rate"

            if avg_hourly_rate != "New Client":
                print("This client isn't new person on upwork")
                if avg_hourly_rate >= average_rate:
                    # High-paying client: keep Intermediate/Expert jobs
                    # regardless of duration.
                    print("평균 지불 hourly rate가 40 이상인 경우:")
                    if ("Intermediate" in tierText or "Expert" in tierText):
                        print("intermediate, expert 레벨의 과제는 과제 기간에 상관없이 현시한다")
                        if amount == 0:
                            budget = "Hourly"
                        else:
                            budget = amount
                        data_base.append(
                            (title, second_query_url, budget, duration,
                             shortEngagement, client_paymentverification,
                             cient_country, totalSpent, feedbackText,
                             prefFreelancerLocationText, tierText[0:6],
                             job_posted_accounts, hire_accounts,
                             avg_hourly_rate_text, client_created_date,
                             create_time_time))
                        insertdb.insert_document(data_base)

                elif avg_hourly_rate < average_rate:
                    # Lower-paying client: require level + weekly-hours +
                    # duration to all match, or a large fixed budget.
                    print("과거의 평균지불레이트 40미만이면 다음의 조건에 맞는 과제들만 현시한다.")
                    if (level_status and weekly_hours_status
                            and period_status) or amount >= fixed_amount:
                        print(
                            "Expert, 10 or 30hours more, 1 month more or 10k more"
                        )
                        if amount == 0:
                            budget = "Hourly"
                        else:
                            budget = amount
                        data_base.append(
                            (title, second_query_url, budget, duration,
                             shortEngagement, client_paymentverification,
                             cient_country, totalSpent, feedbackText,
                             prefFreelancerLocationText, tierText[0:6],
                             job_posted_accounts, hire_accounts,
                             avg_hourly_rate_text, client_created_date,
                             create_time_time))
                        insertdb.insert_document(data_base)

            else:
                # New client (no rate history): apply the strict rule set.
                print(level_status, weekly_hours_status, period_status)
                if (level_status and weekly_hours_status
                        and period_status) or amount >= fixed_amount:
                    print(
                        "Expert, 10 or 30hours more, 1 month more or 10k more")
                    if amount == 0:
                        budget = "Hourly"
                    else:
                        budget = amount
                    data_base.append(
                        (title, second_query_url, budget, duration,
                         shortEngagement, client_paymentverification,
                         cient_country, totalSpent, feedbackText,
                         prefFreelancerLocationText, tierText[0:6],
                         job_posted_accounts, hire_accounts,
                         avg_hourly_rate_text, client_created_date,
                         create_time_time))
                    insertdb.insert_document(data_base)

            counts += 1

        if all_loops == 1:
            break

        refresh_status = refresh_status + 1
# Example #7 (scrape-artifact separator from the code-aggregation page)
def main(htmlstring, driver):
    """Scrape Zillow "for sale by owner" listings and store owner contacts.

    Pages through Zillow's GetSearchPageState JSON endpoint; for every
    result whose status contains "by owner", renders the detail page with
    ``driver``, extracts the street address and a phone number, and inserts
    a row into the "maricopa" table via ``InsertDB``.

    Args:
        htmlstring: unused; kept for interface compatibility with callers.
        driver: selenium driver used to render property detail pages.
    """
    table_name = "maricopa"

    header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'referer': 'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    first_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2285004%22%2C%22mapBounds%22%3A%7B%22west%22%3A-112.10311127801512%2C%22east%22%3A-112.04002572198485%2C%22south%22%3A33.42091247402758%2C%22north%22%3A33.48063826771274%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94720%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%7D&wants={%22cat1%22:[%22mapResults%22,%22total%22]}&requestId=2"

    default_url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%2285006%22%2C%22mapBounds%22%3A%7B%22west%22%3A-112.07973577801513%2C%22east%22%3A-112.01665022198486%2C%22south%22%3A33.43522122804251%2C%22north%22%3A33.494937169247095%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A94722%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%7D&wants={%22cat1%22:[%22listResults%22,%22mapResults%22,%22total%22]}&requestId=3"

    counts = 1

    for page in range(1, 4):
        if page == 1:
            url = first_url
        else:
            # BUG FIX: the original called default_url.format(page), which
            # raises at runtime because the URL's "wants" parameter contains
            # literal '{' and '}' (and the URL has no placeholder anyway --
            # currentPage is hard-coded as 2).  Substitute the page number
            # into the URL-encoded currentPage field instead.
            url = default_url.replace(
                "%22currentPage%22%3A2",
                "%22currentPage%22%3A{}".format(page))

        response = requests.get(url, headers=header)
        result = response.json()
        properties_infos = result["cat1"]["searchResults"]["mapResults"]
        print(len(properties_infos))

        for i in range(0, len(properties_infos)):
            data_base = []
            property_url = "https://www.zillow.com" + properties_infos[i][
                "detailUrl"]
            status_text = properties_infos[i]["statusText"]
            print(status_text, counts)
            counts += 1

            # The hdpData block is not present on every result; fall back
            # to empty strings for the address components.
            try:
                street_add = properties_infos[i]["hdpData"]["homeInfo"][
                    "streetAddress"]
            except Exception:
                street_add = ""

            try:
                city = properties_infos[i]["hdpData"]["homeInfo"]["city"]
            except Exception:
                city = ""

            try:
                state = properties_infos[i]["hdpData"]["homeInfo"]["state"]
            except Exception:
                state = ""

            try:
                zipcode = properties_infos[i]["hdpData"]["homeInfo"]["zipcode"]
            except Exception:
                zipcode = ""

            # Only "for sale by owner" listings are of interest.
            if "by owner" in status_text:
                print("--------------------------------------------------> : ",
                      i + 1)

                driver.get(property_url)
                time.sleep(10)

                street_add = driver.find_element_by_xpath(
                    "//h1[@class='ds-address-container']/span[1]").text
                property_address = street_add + ", " + city + ", " + state + " " + zipcode
                # Find "(123) 456-7890"-style phone numbers in the page.
                phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                    driver.page_source)
                # BUG FIX: phone_number was unbound (NameError) when no
                # phone matched; default to "" and keep the original
                # "last match wins" behavior.
                phone_number = phones[-1] if phones else ""

                print("Property Address--------------------> : ",
                      property_address)
                print("Property Url------------------------> : ", property_url)
                print("Property Status---------------------> : ", status_text)
                print("Owner Phone Number------------------> : ", phone_number)

                # Stable row identifier so reruns can detect duplicates.
                string_id = property_address + status_text + phone_number
                m = hashlib.md5()
                m.update(string_id.encode('utf8'))
                identifier = m.hexdigest()
                print("hash-------------------->", identifier)
                create_time = str(datetime.datetime.now())
                update_time = ""

                insertdb = InsertDB()
                data_base.append((property_address, street_add, city, state,
                                  zipcode, status_text, phone_number,
                                  identifier, create_time, update_time))
                insertdb.insert_document(data_base, table_name)
# Example #8 (scrape-artifact separator from the code-aggregation page)
def parse_google(htmlstring, driver, f):
    """Cross-check NCAA scores from ncaa_flashscore.json against Google.

    For each game in the JSON file, searches "<home> v <away>" on Google
    (``driver`` must already be on a Google search page), compares the
    scoreboard team names and scores with the stored ones, then writes the
    result both to the "ncaa" DB table via ``InsertDB`` and as a JSON
    object to the open file handle *f* (comma-separated between games).
    Closes and quits the driver when done.

    Args:
        htmlstring: unused; kept for interface compatibility.
        driver: selenium driver positioned on a Google search page.
        f: writable file handle receiving one JSON object per game.
    """
    table_name = "ncaa"
    currentDT = datetime.datetime.now()

    print("///---------Google Search--------------///")
    with open('ncaa_flashscore.json') as json_file:
        items = json.load(json_file)
        counts = len(items)
        count = 1

        for item in items:
            data_base = []
            event_time = item['event-time']
            home_team = item['home-name']
            home_score = item['home-score']
            away_team = item['away-name']
            away_score = item['away-score']

            print(home_team, " v ", away_team, "-------------", count)
            search_key = home_team + " v " + away_team

            # Type the query into Google's search box and submit it.
            search_google = driver.find_element_by_xpath(
                "//input[contains(@class, 'gLFyf') and contains(@class, 'gsfi')]"
            )

            search_google.send_keys(search_key)
            search_google.send_keys(Keys.ENTER)
            time.sleep(5)

            # First scoreboard layout: the "immersive" match header.
            try:
                teamNames = driver.find_elements_by_xpath(
                    "//div[contains(@class, 'liveresults-sports-immersive__team-name-width')]"
                )

                fstName = teamNames[0].text
                sndName = teamNames[1].text

                fstScore = driver.find_element_by_xpath(
                    "//div[contains(@class, 'imso_mh__l-tm-sc')]").text
                sndScore = driver.find_element_by_xpath(
                    "//div[contains(@class, 'imso_mh__r-tm-sc')]").text

                print(fstName, "<----->", fstScore)
                print(sndName, "<----->", sndScore)

                # Google may list the teams in either order; accept a match
                # in both orientations.
                if (home_team in fstName and home_score == fstScore) and (
                        away_team in sndName and away_score == sndScore):
                    googleMatch = "True"
                elif (away_team in fstName and away_score == fstScore) and (
                        home_team in sndName and home_score == sndScore):
                    googleMatch = "True"
                else:
                    googleMatch = "False"
            except:
                # Fallback scoreboard layout: the match-grid table.
                try:
                    teamNames = driver.find_elements_by_xpath(
                        "//td[contains(@class, 'liveresults-sports-immersive__match-grid-right-border')]//div[contains(@class, 'ellipsisize') and contains(@class, 'kno-fb-ctx')]/span"
                    )

                    fstName = teamNames[0].text
                    sndName = teamNames[1].text

                    teamScores = driver.find_elements_by_xpath(
                        "//td[contains(@class, 'liveresults-sports-immersive__match-grid-right-border')]//div[@class='imspo_mt__tt-w']"
                    )

                    fstScore = teamScores[0].text
                    sndScore = teamScores[1].text

                    print(fstName, "<----->", fstScore)
                    print(sndName, "<----->", sndScore)

                    # Note the containment direction is reversed here
                    # (scraped name in stored name) versus the layout above.
                    if (fstName in home_team and home_score == fstScore) and (
                            sndName in away_team and away_score == sndScore):
                        googleMatch = "True"
                    elif (fstName in away_team and away_score
                          == fstScore) and (sndName in home_team
                                            and home_score == sndScore):
                        googleMatch = "True"
                    else:
                        googleMatch = "False"
                except:
                    # Neither layout found: treat as no match.
                    googleMatch = "False"

            # "Final" label on the match header (absent for future games).
            try:
                gameStatus = driver.find_element_by_xpath(
                    "//span[contains(@class, 'imso_mh__ft-mtch') and contains(@class, 'imso-medium-font')]"
                ).text
            except:
                gameStatus = "Not"

            if "final" in gameStatus.lower():
                game_status = "Final"
            else:
                game_status = "Future"

            create_time = str(datetime.datetime.now())
            update_time = ""
            print("---------------------------?????",
                  event_time + home_team + away_team)

            # Stable row identifier so reruns can detect duplicates.
            string_id = event_time + home_team + away_team
            m = hashlib.md5()
            m.update(string_id.encode('utf8'))
            identifier = m.hexdigest()
            print("hash-------------------->", identifier)

            insertdb = InsertDB()
            data_base.append((event_time, home_team, home_score, away_team,
                              away_score, googleMatch, game_status, identifier,
                              create_time, update_time))

            insertdb.insert_document(data_base, table_name)

            # NOTE(review): "indentifier" looks like a typo for
            # "identifier", but downstream readers of this JSON may depend
            # on the existing key -- confirm before renaming.
            info = {
                "event-time": event_time,
                "home-name": home_team,
                "home-score": home_score,
                "away-name": away_team,
                "away-score": away_score,
                "google-matching": googleMatch,
                "game-status": game_status,
                "indentifier": identifier,
                "create-time": create_time,
                "update-time": update_time
            }

            json.dump(info, f)
            if count != counts:
                f.write(',\n')

            # Clear the search box for the next query.
            search_google1 = driver.find_element_by_xpath(
                "//input[contains(@class, 'gLFyf') and contains(@class, 'gsfi')]"
            )
            search_google1.clear()

            count += 1
            # return
    driver.close()
    driver.quit()
# Example #9 (scrape-artifact separator from the code-aggregation page)
def main(htmlstring, driver):
    """Scrape Zillow "for sale by owner" listings for Maricopa County, AZ.

    Pages through Zillow's GetSearchPageState JSON API (price-filtered,
    pages 1-8).  For every listing whose status text contains "by owner",
    the detail page is opened in ``driver`` to extract the owner's phone
    number and the "home facts" feature list; one row per listing is then
    stored through InsertDB.

    Args:
        htmlstring: Unused; kept so the call signature matches the other
            scraper entry points in this file.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa_30_08_2020"
    # NOTE(review): constructed but never used below — kept in case the
    # constructor has side effects; confirm before deleting.
    solver = CaptchaSolver()

    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'cookie':
        'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # All query parameters are kept as strings because the request URL is
    # assembled by plain string concatenation below.
    pagination = ""
    usersSearchTerm = "Maricopa County, AZ"
    west = "-114.00266022265627"
    east = "-110.50900787890627"
    south = "32.012669442967976"
    north = "35.813297084142235"
    regionId = "2402"
    regionType = "4"
    mapZoom = "8"
    includeList = "true"
    priceMax = "220000"
    priceMin = "210000"
    monthlyPayMax = "784"
    monthlyPayMin = "648"

    def _search_url(page):
        """Build the GetSearchPageState URL for ``page``.

        Page 1 uses an empty pagination object (Zillow's first-page form);
        later pages set "currentPage" explicitly.  The query is assembled by
        concatenation (not json.dumps) to preserve the exact byte layout the
        endpoint was observed to accept.
        """
        pagination_part = pagination if page == 1 else '"currentPage":' + str(page)
        return (
            'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState='
            '{"pagination":{' + pagination_part + '},'
            '"usersSearchTerm":"' + usersSearchTerm + '",'
            '"mapBounds":{"west":' + west + ',"east":' + east +
            ',"south":' + south + ',"north":' + north + '},'
            '"regionSelection":[{"regionId":' + regionId +
            ',"regionType":' + regionType + '}],'
            '"isMapVisible":true,"mapZoom":' + mapZoom + ','
            '"filterState":{"sort":{"value":"globalrelevanceex"},'
            '"price":{"max":' + priceMax + ',"min":' + priceMin + '},'
            '"monthlyPayment":{"max":' + monthlyPayMax + ',"min":' + monthlyPayMin + '}},'
            '"isListVisible":true}&includeMap=false&includeList=' + includeList
        )

    def _home_info(listing, key):
        """Return listing["hdpData"]["homeInfo"][key], or "" if any level is missing."""
        try:
            return listing["hdpData"]["homeInfo"][key]
        except (KeyError, TypeError):
            return ""

    counts = 1
    for page in range(1, 9):
        response = requests.get(_search_url(page), headers=header)
        properties_infos = response.json()["searchResults"]["listResults"]
        print(len(properties_infos))

        for i, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            street_add = _home_info(listing, "streetAddress")
            city = _home_info(listing, "city")
            state = _home_info(listing, "state")
            zipcode = _home_info(listing, "zipcode")
            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            bathrooms = _home_info(listing, "bathrooms")
            bedrooms = _home_info(listing, "bedrooms")
            tax_assessed_value = _home_info(listing, "taxAssessedValue")
            zestimate = _home_info(listing, "zestimate")
            rent_zestimate = _home_info(listing, "rentZestimate")
            home_type = _home_info(listing, "homeType")

            if "by owner" not in status_text:
                continue

            print("--------------------------------------------------> : ",
                  i + 1)

            driver.get(property_url)
            time.sleep(10)

            # When Zillow serves a reCAPTCHA interstitial, pause 20s so it
            # can be solved before scraping (automated solver path removed).
            try:
                driver.find_element_by_class_name("g-recaptcha")
                time.sleep(20)
            except Exception:
                pass

            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            # Bug fix: phone_number previously stayed unbound (NameError)
            # when no phone pattern appeared on the page; default to "" and
            # otherwise keep the original behavior of using the LAST match.
            phone_number = phones[-1] if phones else ""

            features_labels = driver.find_elements_by_xpath(
                "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-standard-label') and contains(@class, 'ds-home-fact-label')]"
            )
            features_infos = driver.find_elements_by_xpath(
                "//ul[@class='ds-home-fact-list']//span[contains(@class, 'ds-body') and contains(@class, 'ds-home-fact-value')]"
            )

            # Map each "home fact" label substring to its value; first
            # matching needle wins per label, mirroring the old elif chain.
            facts = {"Parking": "", "Year built": "", "HOA": "",
                     "Heating": "", "Lot": "", "Cooling": "", "Price/": ""}
            for label_el, value_el in zip(features_labels, features_infos):
                label_txt = label_el.text
                for needle in facts:
                    if needle in label_txt:
                        facts[needle] = value_el.text
                        break

            parking = facts["Parking"]
            year_built = facts["Year built"]
            hoa = facts["HOA"]
            heating = facts["Heating"]
            lot = facts["Lot"]
            cooling = facts["Cooling"]
            price_sqft = facts["Price/"]

            print("Property Address--------------------> : ",
                  property_address)
            print("Property Url------------------------> : ", property_url)
            print("Property Status---------------------> : ", status_text)
            print("Owner Phone Number------------------> : ", phone_number)
            print("BathRooms---------------------------> : ", bathrooms)
            print("BedRooms----------------------------> : ", bedrooms)
            print("Tax Assessed Value------------------> : ",
                  tax_assessed_value)
            print("Zestimate---------------------------> : ", zestimate)
            print("Rent Zestimate----------------------> : ",
                  rent_zestimate)
            print("Home Type---------------------------> : ", home_type)
            print("Parking-----------------------------> : ", parking)
            print("Year Built--------------------------> : ", year_built)
            print("HOA---------------------------------> : ", hoa)
            print("Heating-----------------------------> : ", heating)
            print("Lot---------------------------------> : ", lot)
            print("Cooling-----------------------------> : ", cooling)
            print("Price Sqft--------------------------> : ", price_sqft)

            # Stable identity for dedup: MD5 of address + status + phone.
            string_id = property_address + status_text + phone_number
            identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
            print("hash-------------------->", identifier)
            # Bug fix: was `datetime.now()`, which fails under
            # `import datetime` (the module has no `now`); every other entry
            # point in this file uses datetime.datetime.now().
            create_time = str(datetime.datetime.now())
            update_time = ""

            insertdb = InsertDB()
            data_base.append(
                (property_address, street_add, city, state, zipcode,
                 status_text, phone_number, bathrooms, bedrooms,
                 tax_assessed_value, zestimate, rent_zestimate, home_type,
                 parking, year_built, hoa, heating, lot, cooling,
                 price_sqft, identifier, create_time, update_time))
            insertdb.insert_document(data_base, table_name)
def main(htmlstring, driver):
    """Scrape Zillow map results for the Maricopa County region.

    Fetches the GetSearchPageState "mapResults" list once; for every listing
    whose status text contains "by owner", the detail page is opened in
    ``driver``, a phone number is extracted from the page source, a CSV row
    is appended, and the record is stored through InsertDB.

    Args:
        htmlstring: Unused; kept so the call signature matches the other
            scraper entry points in this file.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa"
    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'referer':
        'https://www.zillow.com/homes/85139_rb/',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Pre-encoded searchQueryState covering the Maricopa County map bounds.
    url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-113.68680328906252%2C%22east%22%3A-110.68753571093752%2C%22south%22%3A31.344189534984903%2C%22north%22%3A35.173746183166216%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A2402%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22mapZoom%22%3A8%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D"

    response = requests.get(url, headers=header)
    properties_infos = response.json()["searchResults"]["mapResults"]
    print(len(properties_infos))

    def _home_info(listing, key):
        """Return listing["hdpData"]["homeInfo"][key], or "" if any level is missing."""
        try:
            return listing["hdpData"]["homeInfo"][key]
        except (KeyError, TypeError):
            return ""

    for i, listing in enumerate(properties_infos):
        data_base = []
        # mapResults carries site-relative detail URLs (unlike listResults).
        property_url = "https://www.zillow.com/" + listing["detailUrl"]
        status_text = listing["statusText"]

        street_add = _home_info(listing, "streetAddress")
        city = _home_info(listing, "city")
        state = _home_info(listing, "state")
        zipcode = _home_info(listing, "zipcode")
        property_address = street_add + ", " + city + ", " + state + " " + zipcode

        if "by owner" not in status_text:
            continue

        print("--------------------------------------------------> : ",
              i + 1)
        print("Property Address--------------------> : ", property_address)
        print("Property Url------------------------> : ", property_url)
        print("Property Status---------------------> : ", status_text)

        driver.get(property_url)
        time.sleep(10)

        phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                            driver.page_source)
        # Bug fix: phone_number previously stayed unbound (NameError) when no
        # phone pattern was found; default to "" and otherwise keep the
        # original behavior of using the LAST match.
        phone_number = phones[-1] if phones else ""
        print("Owner Phone Number------------------> : ", phone_number)

        with open("Zillow_Maricopan_AZ_only500.csv",
                  "a",
                  newline="",
                  encoding="utf-8") as f:
            csv.writer(f).writerow(
                [property_address, status_text, phone_number])

        # Stable identity for dedup: MD5 of address + status + phone.
        string_id = property_address + status_text + phone_number
        identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
        print("hash-------------------->", identifier)
        create_time = str(datetime.datetime.now())
        update_time = ""

        insertdb = InsertDB()
        data_base.append((property_address, street_add, city, state,
                          zipcode, status_text, phone_number, identifier,
                          create_time, update_time))
        insertdb.insert_document(data_base, table_name)
# ---- Example #11 (separator; was stray non-code text "示例#11" / "0" that would raise NameError) ----
def main(htmlstring, driver):
    """Scrape Zillow "for sale by owner" listings for ZIP code 85006.

    Pages through Zillow's GetSearchPageState JSON API (pages 1-3; page 1
    uses the unpaginated first-page form).  Each "by owner" listing is
    opened in ``driver`` to extract an owner phone number, then one row per
    listing is stored through InsertDB.

    Args:
        htmlstring: Unused; kept so the call signature matches the other
            scraper entry points in this file.
        driver: Selenium WebDriver used to render listing detail pages.
    """
    table_name = "maricopa_30_08_2020"

    header = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'en-US,en;q=0.9,ko;q=0.8',
        'cookie':
        'zguid=23|%2410ab80e6-80db-4e0a-9f70-2449ca972d74; _ga=GA1.2.759159145.1599348167; zjs_user_id=null; zjs_anonymous_id=%2210ab80e6-80db-4e0a-9f70-2449ca972d74%22; _gcl_au=1.1.607943717.1599348169; _pxvid=be9ff2f0-efce-11ea-9652-0242ac12000b; __gads=ID=cab593cad6cbce43:T=1599348200:S=ALNI_MaFYrYCZZvPIITKUEoEDXGvXSRYwQ; _gid=GA1.2.1287304564.1599556314; _pin_unauth=dWlkPU9EUXdZamxrTldJdE9ESTBNUzAwWXprMExXSXdNekl0TkdWak0yWTFNVEE1TldJeSZycD1abUZzYzJV; ki_r=; ki_s=; _fbp=fb.1.1599562363584.1440832488; g_state={"i_p":1599570378147,"i_l":1}; ki_t=1599556892885%3B1599556892885%3B1599563330503%3B1%3B19; JSESSIONID=62F47C1DAFBF00B3DB7B301BEA3E6586; zgsession=1|8840c1ee-f8a6-43d7-9a7b-3169df33c987; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _pxff_rf=1; _pxff_fp=1; _pxff_bsco=1; _px3=6d722620cec81d0df86c8eff4b631bdd93cef163fb0a14808e80f81013747454:M7trNae6CpAztMArZT97P3Vy9jFLz9FuEZ5p2efYpXeqOJC7Bw+xzsVGxArAYe+PM+vQKNuEI3qytjutx2UEXg==:1000:M1Vo/kdU1lI8Zqky6jJnuwSu45xHxX8ueCLKUiW6KX8rNR+VWAORLQi+1ns4dhilOU7gSCJfJmToj1SeyKN49kHZQZIQ0wSFeFtn+txzkIo/fhFAr2Cq7WvjCVWw7GBx8F3JIjMqHf1BZAAFg0YXqy/IVuCFhvIioSyK35nkm4A=; _gat=1; KruxPixel=true; DoubleClickSession=true; _uetsid=f44fc66ca5c392a6859170ed776b6ae9; _uetvid=dc708dafb2b6d91ab6c6923ac1ae6673; AWSALB=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; AWSALBCORS=3gLhoP6QCdmf4zskymQ7ej/kbqzRHNkv+QNQMFmS6Y7S9pENaOusdnQVhFHWm1W9z8/1Og/WmO8JK63ys0wmi6ZNwRc4SN8lf4pcoyrm+nj8lLAPLRDIqMaYAEte; search=6|1602203173818%7Crb%3DMaricopa%252C-AZ%26rect%3D33.203401%252C-111.882231%252C32.788612%252C-112.512953%26disp%3Dmap%26mdm%3Dauto%26sort%3Ddays%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26rs%3D0%26ah%3D0%26singlestory%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%09%0932697%09%09%09%09%09%09',
        'sec-fetch-dest':
        'empty',
        'sec-fetch-mode':
        'cors',
        'sec-fetch-site':
        'same-origin',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # All query parameters are kept as strings because the request URL is
    # assembled by plain string concatenation below.
    pagination = ""
    usersSearchTerm = "85006"
    west = "-112.07973577801513"
    east = "-112.01665022198486"
    south = "33.43522122804253"
    north = "33.494937169247144"
    regionId = "94722"
    regionType = "7"
    mapZoom = "14"
    includeList = "true"

    def _search_url(page):
        """Build the GetSearchPageState URL for ``page``.

        Page 1 uses an empty pagination object (Zillow's first-page form);
        later pages set "currentPage" explicitly.
        """
        pagination_part = pagination if page == 1 else '"currentPage":' + str(page)
        return (
            'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState='
            '{"pagination":{' + pagination_part + '},'
            '"usersSearchTerm":"' + usersSearchTerm + '",'
            '"mapBounds":{"west":' + west + ',"east":' + east +
            ',"south":' + south + ',"north":' + north + '},'
            '"regionSelection":[{"regionId":' + regionId +
            ',"regionType":' + regionType + '}],'
            '"isMapVisible":true,"mapZoom":' + mapZoom + ','
            '"filterState":{"sort":{"value":"globalrelevanceex"}},'
            '"isListVisible":true}&includeMap=false&includeList=' + includeList
        )

    def _home_info(listing, key):
        """Return listing["hdpData"]["homeInfo"][key], or "" if any level is missing."""
        try:
            return listing["hdpData"]["homeInfo"][key]
        except (KeyError, TypeError):
            return ""

    # Preserved from the original: log the first-page URL before scraping.
    print(_search_url(1))

    counts = 1
    for page in range(1, 4):
        response = requests.get(_search_url(page), headers=header)
        properties_infos = response.json()["searchResults"]["listResults"]
        print(len(properties_infos))

        for i, listing in enumerate(properties_infos):
            data_base = []
            property_url = listing["detailUrl"]
            status_text = listing["statusText"]
            print(status_text, counts)
            counts += 1

            street_add = _home_info(listing, "streetAddress")
            city = _home_info(listing, "city")
            state = _home_info(listing, "state")
            zipcode = _home_info(listing, "zipcode")
            property_address = street_add + ", " + city + ", " + state + " " + zipcode

            if "by owner" not in status_text:
                continue

            print("--------------------------------------------------> : ",
                  i + 1)

            driver.get(property_url)
            time.sleep(10)

            phones = re.findall(r'[(][\d]{3}[)][ ]?[\d]{3}-[\d]{4}',
                                driver.page_source)
            # Bug fix: phone_number previously stayed unbound (NameError)
            # when no phone pattern was found; default to "" and otherwise
            # keep the original behavior of using the LAST match.
            phone_number = phones[-1] if phones else ""

            print("Property Address--------------------> : ",
                  property_address)
            print("Property Url------------------------> : ", property_url)
            print("Property Status---------------------> : ", status_text)
            print("Owner Phone Number------------------> : ", phone_number)

            # Stable identity for dedup: MD5 of address + status + phone.
            string_id = property_address + status_text + phone_number
            identifier = hashlib.md5(string_id.encode('utf8')).hexdigest()
            print("hash-------------------->", identifier)
            create_time = str(datetime.datetime.now())
            update_time = ""

            insertdb = InsertDB()
            data_base.append((property_address, street_add, city, state,
                              zipcode, status_text, phone_number,
                              identifier, create_time, update_time))
            insertdb.insert_document(data_base, table_name)