def up(name, ema, pas):
    """Sign up a new Udemy account through Tor Browser.

    Opens the Udemy sign-up popup, fills in the full name, e-mail and
    password fields, unticks the marketing e-mail checkbox and submits
    the form.  If the site then redirects to the 'occupation' step, the
    first ``udlite-btn`` button on that page is clicked as well.

    Args:
        name: Full name typed into the ``id_fullname`` field.
        ema: E-mail address typed into the ``email--1`` field.
        pas: Password typed into the ``password`` field.

    Returns:
        True when the post-submit URL contains ``'occupation'`` or
        ``'=1'``; False otherwise.

    The browser is always shut down before returning or raising.
    """
    signup_url = (
        "https://www.udemy.com/join/signup-popup/"
        "?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F"
    )
    # getElementsByClassName returns a collection, so every match has to be
    # hidden individually; the collection itself has no `style` property.
    hide_cookie_banner = (
        'var els = document.getElementsByClassName("ot-sdk-container");'
        ' for (var i = 0; i < els.length; i++) { els[i].style.display = "none"; }'
    )

    browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    try:
        # connect to site
        browser.load_url(signup_url, wait_on_page=5, wait_for_page_body=True)

        # fill in the registration form
        browser.find_element_by_id("id_fullname").send_keys(name)
        browser.find_element_by_id("email--1").send_keys(ema)
        browser.find_element_by_id("password").send_keys(pas)

        # scroll so the lower part of the form is in view
        browser.execute_script("window.scrollBy(0,200)")
        browser.execute_script(
            'document.getElementById("id_subscribe_to_emails").checked = false')

        # submit and give the site a moment to redirect
        browser.find_element_by_id('submit-id-submit').click()
        sleep(1)

        landed_on = browser.current_url
        if 'occupation' in landed_on:
            sleep(3)
            try:
                browser.execute_script(hide_cookie_banner)
            except Exception:
                # Hiding the banner is best-effort; the click below is
                # retried once if something still gets in the way.
                pass
            buttons = browser.find_elements_by_class_name("udlite-btn")
            try:
                buttons[0].click()
            except Exception:
                # Hide the banner again and retry exactly once; a second
                # failure propagates to the caller.
                browser.execute_script(hide_cookie_banner)
                buttons[0].click()
            sleep(3)
            return True

        return '=1' in landed_on
    finally:
        # quit() instead of close(): also tears down the driver session, and
        # running it in `finally` covers the failure paths.
        browser.quit()
class TruliaHelper():
    """Look up addresses from a spreadsheet on trulia.com via Tor Browser."""

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/XX/XXXX/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')

    @staticmethod
    def formatAddress(parts):
        """Join address components into one search string.

        Missing cells (None / NaN) and blank strings are skipped, and
        whole-number floats (e.g. a zip code read as ``92570.0``) are
        rendered without the trailing ``.0``.

        Args:
            parts: Iterable of scalar address components.

        Returns:
            The components joined by single spaces.
        """
        cleaned = []
        for part in parts:
            if pd.isna(part):
                continue
            if isinstance(part, float) and part.is_integer():
                part = int(part)
            text = str(part).strip()
            if text:
                cleaned.append(text)
        return " ".join(cleaned)

    # method to get items from given link.
    def getItems(self, excel_path="/home/XXXXX/XXXXX/XXXXXX.xlsx"):
        """Search Trulia for every address row of the spreadsheet.

        Args:
            excel_path: Workbook containing the 'Site Address',
                'Site City', 'Site State' and 'Site Zip' columns.

        Returns:
            A list with one ``getItemDetail()`` result per row.

        The driver is shut down when this method finishes, whether it
        succeeds or raises.
        """
        address_columns = ['Site Address', 'Site City', 'Site State', 'Site Zip']
        items = []
        try:
            df = pd.read_excel(excel_path)
            for row in df[address_columns].values.tolist():
                # Type a plain address, not the repr of the row list.
                keyword = self.formatAddress(row)
                self.driver.get(self.url)
                search_box = self.driver.find_element_by_id("homepageSearchBoxTextInput")
                search_box.clear()
                search_box.send_keys(keyword)
                # find_element_* raises when nothing matches, so the button
                # is always present once this call returns.
                search_btn = self.driver.find_element_by_xpath(
                    "//button[@data-auto-test-id='searchButton']")
                search_btn.click()
                # wait for the results page to render
                time.sleep(10)
                items.append(self.getItemDetail())
        finally:
            self.driver.quit()
        return items

    def getItemDetail(self):
        """Extract the price from the page currently loaded in the driver.

        Returns:
            ``{"price": <text>}`` when the price element is found,
            otherwise an empty dict.
        """
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            price_el = soup.find(
                "div",
                attrs={"class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"})
        except Exception as exc:
            # Best-effort: one unreadable page must not abort the whole run.
            print("Could not read item detail:", exc)
            return data
        if price_el is None:
            return data
        price = price_el.text
        print(price)
        data["price"] = price
        return data

    # method to start process.
    def start(self):
        """Run the scrape and print the collected items."""
        items = self.getItems()
        print("Items : ", items)
def _start_tor_browser():
    """Launch Tor Browser, retrying once after 30 s if the first start fails."""
    try:
        return TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    except Exception:
        # selenium.common.exceptions.WebDriverException: Message: Access is denied. (os error 5)
        # mozilla is updating
        print('probably updating sleep 30')
        sleep(30)
        return TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)


def _quit_quietly(browser):
    """Shut a browser down, ignoring errors from an already-dead session."""
    try:
        browser.quit()
    except Exception:
        pass


def loggin(ema, pas):
    """Log in to Udemy through Tor Browser.

    Args:
        ema: E-mail address typed into the ``email--1`` field.
        pas: Password typed into the ``id_password`` field.

    Returns:
        The open browser when login appears to have succeeded (the
        avatar element is present or the URL still contains
        ``'udemy.com'``); False when the e-mail field never shows up;
        None otherwise.  On every non-success path the browser is shut
        down, since the caller gets no handle with which to close it.
    """
    login_url = (
        "https://www.udemy.com/join/login-popup/"
        "?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F"
    )

    browser = _start_tor_browser()
    try:
        # connect to site
        try:
            browser.load_url(login_url, wait_on_page=5, wait_for_page_body=True)
        except Exception:
            # selenium.common.exceptions.NoSuchWindowException:
            # Message: Browsing context has been discarded
            # Drop the broken session before starting a fresh one.
            _quit_quietly(browser)
            browser = _start_tor_browser()
            browser.load_url(login_url, wait_on_page=5, wait_for_page_body=True)

        # maximise
        browser.maximize_window()
        # Scroll
        browser.execute_script("window.scrollTo(0,100)")

        # The form can be slow to render: look once, wait, look again.
        try:
            email_el = browser.find_element_by_id("email--1")
        except Exception:
            sleep(10)
            try:
                email_el = browser.find_element_by_id("email--1")
            except Exception:
                _quit_quietly(browser)
                return False
        email_el.send_keys(ema)

        # enter password
        browser.find_element_by_id("id_password").send_keys(pas)

        # click submit
        browser.find_element_by_id('submit-id-submit').click()
        sleep(2)

        # check
        try:
            avatar = browser.find_element_by_id('u711-popover-trigger--18')
        except Exception:
            avatar = None

        if avatar or 'udemy.com' in browser.current_url:
            return browser

        _quit_quietly(browser)
        return None
    except Exception:
        # Do not leave a Tor Browser process behind when the flow fails.
        _quit_quietly(browser)
        raise
class TruliaHelper():
    """Search trulia.com via Tor Browser and export image/price pairs to CSV."""

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath,
                                       tbb_logfile_path='test.log')

    # method to get items from given link.
    def getItems(self):
        """Run each keyword through the Trulia home-page search.

        Returns:
            A list with one ``getItemDetail()`` result per keyword.

        The driver is shut down when this method finishes, whether it
        succeeds or raises.
        """
        items = []
        keywords = ['512 W 10th St Perris CA 92570'] * 2
        try:
            for keyword in keywords:
                self.driver.get(self.url)
                search_box = self.driver.find_element_by_id(
                    "homepageSearchBoxTextInput")
                search_box.clear()
                search_box.send_keys(keyword)
                # find_element_* raises when nothing matches, so the button
                # is always present once this call returns.
                search_btn = self.driver.find_element_by_xpath(
                    "//button[@data-auto-test-id='searchButton']")
                print("Going to click")
                search_btn.click()
                # wait for the results page to render
                time.sleep(10)
                items.append(self.getItemDetail())
        finally:
            self.driver.quit()
        return items

    def getItemDetail(self):
        """Extract image URL and price from the page loaded in the driver.

        Returns:
            ``{"image": <src>, "price": <text>}`` when both elements are
            found, otherwise an empty dict.
        """
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            tile = soup.find(
                "div",
                attrs={"class": "Tiles__TileBackground-fk0fs3-0 cSObNX"})
            img = tile.find("img") if tile is not None else None
            price_el = soup.find(
                "div",
                attrs={
                    "class":
                    "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
                })
        except Exception as exc:
            # Best-effort: one unreadable page must not abort the whole run.
            print("Could not read item detail:", exc)
            return data
        image = img.get("src") if img is not None else None
        if image is None or price_el is None:
            return data
        data.update({"image": image, "price": price_el.text})
        return data

    # method to write csv file
    def writeCSVFile(
            self,
            data,
            path='/home/gc14/Documents/fiverr/custom_scrapers/home/trulia.csv'):
        """Write the scraped items to a CSV file with Image/Price columns.

        Args:
            data: Iterable of dicts as returned by ``getItemDetail``.
                Items missing a key (failed scrapes) produce blank cells
                rather than aborting the export half-way.
            path: Destination file; defaults to the original fixed
                location.

        I/O errors are reported on stdout and not raised.
        """
        try:
            # newline='' is what the csv module requires so that it controls
            # line endings itself.
            with open(path, mode='w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=['Image', 'Price'])
                writer.writeheader()
                for d in data:
                    writer.writerow({
                        'Image': d.get('image', ''),
                        'Price': d.get('price', ''),
                    })
            print("File written successfully.")
        except OSError:
            print(sys.exc_info())

    # method to start process.
    def start(self):
        """Scrape all keywords and, if anything came back, export it."""
        items = self.getItems()
        print("Items : ", len(items))
        if items:
            self.writeCSVFile(items)