# These snippets assume the following module-level imports (omitted from the
# original excerpt) plus helpers such as start_firefox, query, extract_rental,
# get_address, and restart defined elsewhere; `trulia` is presumably the
# site's base URL.
import csv
import os
import random

import pandas as pd
import psutil
from time import sleep


def start_driver():
    print("Starting Driver")
    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)
    sleep(5)

    try:
        # The blocker extensions typically open their own tab on install;
        # close it and switch back to the main window.
        driver.switch_to.window(driver.window_handles[1])
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        return driver
    except Exception:
        print("Switching windows failed")
        driver.quit()
        restart("logfile", debug, start)  # `debug` and `start` come from module scope


def open_page(url):
    # Relies on a module-level `driver`. Returns 0 on success, 1 on failure.
    driver.delete_all_cookies()
    driver.get(url)
    print(driver.title)
    sleep(3)
    if "Real Estate, " in driver.title or "Not Found" in driver.title:
        print("404 on Trulia")
        return 1
    elif "Trulia" in driver.title:
        print("Successfully loaded URL")
        return 0
    else:
        print("Being blocked from accessing Trulia. Restarting...")
        driver.quit()
        restart("logfile", debug, start)
        return 1
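

# `handle_ejscreen_input` and `extract_pollution_from_report` (used in
# update_ejscreen below) are defined elsewhere. A hypothetical sketch of the
# input half, assuming the EJScreen mobile page exposes a plain text search
# field; the CSS selector is a placeholder, not taken from the source.
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def handle_ejscreen_input(driver, address):
    search = driver.find_element(By.CSS_SELECTOR, "input[type='text']")  # placeholder
    search.clear()
    search.send_keys(address)
    search.send_keys(Keys.RETURN)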


def update_ejscreen(idx, debug_mode):
    # Opens EJScreen in a new tab, looks up this index's address, scrapes the
    # pollution report into `d`, and writes the result to file.
    print("Crawling ejscreen")
    d = {}
    driver.execute_script(
        "window.open('https://ejscreen.epa.gov/mapper/mobile/', 'new_tab')")
    sleep(5)
    driver.switch_to.window(driver.window_handles[1])
    address = get_address(idx)
    try:
        handle_ejscreen_input(driver, address)
        sleep(3)
        extract_pollution_from_report(driver, d)
    except Exception:
        if debug_mode:
            # In debug mode, kill leftover browser processes and re-raise so
            # the traceback stays visible.
            driver.quit()
            for proc in psutil.process_iter():
                if proc.name() in ("firefox", "geckodriver"):
                    proc.kill()
            raise
        else:
            print("Cannot extract pollution. Restarting")
            driver.quit()
            restart("logfile", debug_mode, idx)

    write_ejscreen_to_file(idx, d)
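
# `restart` is called throughout but never defined in this excerpt. Its call
# sites pass the log path, the debug flag, and a resume index. A hypothetical
# sketch that simply re-executes the current script:
import sys


def restart(crawler_log, debug_mode, start):
    print("Restarting crawl from index", start, "- progress is in", crawler_log)
    if debug_mode:
        sys.exit(1)  # in debug mode, stop instead of looping forever
    os.execv(sys.executable, [sys.executable] + sys.argv)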
Example #4

def main(input_file, output_file, start, end, crawler_log, geckodriver_path,
         debug_mode, adblock_path, uBlock_path):
    urls = []

    df = pd.read_csv(input_file)

    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)
    sleep(5)

    try:
        driver.switch_to.window(driver.window_handles[1])
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    except Exception:
        print("Switching windows failed")
        driver.quit()
        restart(crawler_log, debug_mode, start)

    i = int(start)
    count = 0
    for address in df["full"][int(start):int(end)]:
        if count == 60:
            # Every 60 addresses, clear stale /tmp files and restart the crawl.
            os.system("sudo tmpreaper -m 1h /tmp")
            # The original called restart() with no arguments, which does not
            # match its signature elsewhere; the usual three arguments are
            # restored here, resuming from the current index.
            restart(crawler_log, debug_mode, i)
        try:
            print(i, address)

            # url1 = query(driver, "buy", address)
            url1 = ""
            url2 = query(driver, "rent", address)
            url3 = ""
            # url3 = query(driver, "sold", address)

            urls.append([url1, url2, url3])

            # Text mode with newline="" replaces the Python 2-style "ab" mode,
            # which csv.writer cannot use under Python 3.
            with open(output_file, "a", newline="") as log:
                filewriter = csv.writer(log, delimiter=',',
                                        quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([url1, url2, url3])
            with open(crawler_log, "a", newline="") as log:
                filewriter = csv.writer(log, delimiter=',',
                                        quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([i])

            i += 1
            count += 1

        except Exception:
            if debug_mode:
                driver.quit()
                for proc in psutil.process_iter():
                    if proc.name() in ("firefox", "geckodriver"):
                        proc.kill()
                raise
            else:
                driver.quit()
                restart(crawler_log, debug_mode, start)
    driver.quit()
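
# `query` (used above) is not included either; its call sites suggest it
# searches Trulia for an address in a given mode ("buy", "rent", "sold") and
# returns the resulting URL. A hypothetical sketch; the selector is a
# placeholder, and the real helper presumably also selects the matching tab.
def query(driver, mode, address):
    driver.get("https://www.trulia.com/")
    sleep(3)
    box = driver.find_element(By.CSS_SELECTOR, "input[type='search']")  # placeholder
    box.send_keys(address)
    box.send_keys(Keys.RETURN)
    sleep(3)
    return driver.current_url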
Example #5
def main(crawl_type, input_file, output_file, start, end, crawler_log,
         geckodriver_path, repair, debug_mode, adblock_path, uBlock_path):
    """Main function to do the crawling
	
	Args:
	    crawl_type (List of String): default ["U"]. Can add ["A", "L"]
	    input_file (String): Name of the input file
	    output_file (String): Name of the output file
	    start (int): Starting index of the crawling
	    end (int): Ending index of the crawling
	    crawler_log (String): Name of the log
	    geckodriver_path (String): Path to the geckodriver
	    repair (Bool): Wheather this crawling is repair mode or not
	    debug_mode (Bool): Wheater this crawling is debug mode or not
	    adblock_path (String): Path to the adblock
	    uBlock_path (String): Path to the uBlock
	"""

    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)

    sleep(5)

    try:
        driver.switch_to.window(driver.window_handles[1])
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    except Exception:
        print("Switching windows failed")
        driver.quit()
        restart(crawler_log, debug_mode, start)

    df = pd.read_csv(input_file)

    urls = df["URL"]

    if "L" in crawl_type:
        location = df["LatLon"]

    if "A" in crawl_type:
        address_col = df["Address"]

    if repair:
        # In repair mode, normalize all previously crawled columns to str so
        # repaired values can be compared and written back uniformly.
        for col in ("Sqft", "Type", "Address", "City", "State", "Zip_Code",
                    "Year", "Days_on_Trulia", "Bedroom_min", "Bedroom_max",
                    "Bathroom_min", "Bathroom_max", "Phone_Number", "URL"):
            df[col] = df[col].astype(str)

    try:
        for i in range(int(start), int(end)):
            print(i)
            print(urls[i])
            driver.delete_all_cookies()
            d = {}

            crawled_trulia = True
            driver.get(urls[i])
            print(driver.title)
            sleep(3)
            if "Real Estate, " in driver.title or "Not Found" in driver.title:
                print("404 in trulia")
                crawled_trulia = False
            elif "Trulia" in driver.title:
                print("Start crawling")
                flag = True  # keep `flag` bound even if extraction fails below
                try:
                    if repair:
                        flag = extract_rental(driver, d, "R", address_col[i],
                                              df, i)
                    elif "A" in crawl_type:
                        flag = extract_rental(driver, d, "A", address_col[i],
                                              index=i)
                    else:
                        flag = extract_rental(driver, d, "U")
                except Exception:
                    if debug_mode:
                        driver.quit()
                        for proc in psutil.process_iter():
                            if proc.name() in ("firefox", "geckodriver"):
                                proc.kill()
                        raise
                    else:
                        driver.quit()
                        print("Reached except after extract_rental")
                        restart(crawler_log, debug_mode, start)
                if not flag:
                    crawled_trulia = False
            elif "this page" in driver.title.lower():
                print("Being blocked from accessing Trulia. Restarting...")
                driver.quit()
                restart(crawler_log, debug_mode, start)
            else:
                crawled_trulia = False
                address = driver.title.split(" - ")[0]
                print("Trulia is not available. Continuing")

            print("Trulia crawling done. Crawling ejscreen now")

            if repair:
                df.to_csv(input_file, index=False)
                with open(crawler_log, "a", newline="") as log:
                    filewriter = csv.writer(log, delimiter=',',
                                            quoting=csv.QUOTE_MINIMAL)
                    filewriter.writerow([i])
                print("Repair done. Moving on to the next index...")
                sleep(random.randint(10, 40))
                continue

            if "L" in crawl_type:
                address = location[i]
            elif "A" in crawl_type:
                address = address_col[i]
            elif "A" not in crawl_type:
                if crawled_trulia == False:
                    address = driver.title.split(" - ")[0]
                    if address.find("#") != -1:
                        address = address[:address.find("#")]
                    else:
                        address = address[:address.find("For")]
                else:
                    if d["address"].find('#') != -1:
                        add = d["address"][:d["address"].find('#')]
                    else:
                        add = d["address"]
                    address = add + ", " + d["city"] + ", " + d[
                        "state"] + " " + d["zip code"]
                if crawled_trulia == False and "Real Estate, " in driver.title:
                    address = "NA"

            driver.execute_script(
                "window.open('https://ejscreen.epa.gov/mapper/mobile/', 'new_tab')"
            )
            sleep(5)
            driver.switch_to.window(driver.window_handles[1])

            # if (len(address) < 10):
            # 	save_data(d, urls[i], output_file, crawl_type)
            # 	with open(crawler_log, "ab") as log:
            # 		filewriter = csv.writer(log, delimiter = ',', quoting = csv.QUOTE_MINIMAL)
            # 		filewriter.writerow([i])
            # 	continue

            try:
                handle_ejscreen_input(driver, address)
                sleep(5)
                extract_pollution_from_report(driver, d)
            except Exception:
                if debug_mode:
                    driver.quit()
                    for proc in psutil.process_iter():
                        if proc.name() in ("firefox", "geckodriver"):
                            proc.kill()
                    raise
                else:
                    print("Cannot extract pollution. Restarting")
                    driver.quit()
                    restart(crawler_log, debug_mode, start)

            save_rental(d, urls[i], output_file)

            with open(crawler_log, "a", newline="") as log:
                filewriter = csv.writer(log, delimiter=',',
                                        quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([i])

            driver.close()
            driver.switch_to.window(driver.window_handles[0])

            sleep(random.randint(10, 40))
    except Exception:
        if debug_mode:
            driver.quit()
            for proc in psutil.process_iter():
                if proc.name() in ("firefox", "geckodriver"):
                    proc.kill()
            raise
        else:
            driver.quit()
            restart(crawler_log, debug_mode, start)

    driver.quit()
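
# `save_rental` and `write_ejscreen_to_file` are likewise defined elsewhere;
# they evidently append one crawled record to the output CSV. A hypothetical
# sketch for save_rental; the column order here is a guess, not from the source.
def save_rental(d, url, output_file):
    with open(output_file, "a", newline="") as out:
        writer = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([url] + [d.get(key, "NA") for key in sorted(d)])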