Пример #1
0
def up(name, ema, pas):

    browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    # connect to site
    browser.load_url(
        "https://www.udemy.com/join/signup-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
        wait_on_page=5,
        wait_for_page_body=True)
    # find link button
    #reg_el = browser.find_element_by_link_text("Sign up")
    # https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F
    # click
    # reg_el.click()
    # enter full name
    full_name = browser.find_element_by_id("id_fullname")
    full_name.send_keys(name)
    # enter email
    email_el = browser.find_element_by_id("email--1")
    email_el.send_keys(ema)
    # enter password
    pass_el = browser.find_element_by_id("password")
    pass_el.send_keys(pas)
    # Scroll
    browser.execute_script("window.scrollBy(0,200)")
    browser.execute_script(
        'document.getElementById("id_subscribe_to_emails").checked = false')

    # find submit link
    sub_el = browser.find_element_by_id('submit-id-submit')
    # click submit
    sub_el.click()
    sleep(1)
    # check
    if 'occupation' in browser.current_url:
        # find submit link
        sleep(3)
        try:
            browser.execute_script(
                'document.getElementsByClassName("ot-sdk-container").sytle.display = "none"'
            )
        except:
            pass
        cl = browser.find_elements_by_class_name("udlite-btn")
        try:
            cl[0].click()
        except:
            browser.execute_script(
                'document.getElementsByClassName("ot-sdk-container").sytle.display = "none"'
            )
            cl[0].click()

        sleep(3)
        browser.close()
        return True
    if '=1' in browser.current_url:
        browser.close()
        return True
class TruliaHelper():

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set chrome path here.
        tbpath = "/home/XX/XXXX/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        df=pd.read_excel("/home/XXXXX/XXXXX/XXXXXX.xlsx")
        a=df['Site Address']
        b=df['Site City']
        c=df['Site State']
        d=df['Site Zip']
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        for keyword in (pd.concat([a,b,c,d],axis=1)).values.tolist():
#         keywords = ['512 W 10th St Perris CA 92570'] * 10
#         for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id("homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(str(keyword))
            search_btn = self.driver.find_element_by_xpath("//button[@data-auto-test-id='searchButton']")
            if search_btn:
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())
            # break
        self.driver.close()
        return items


    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            #image = soup.find("div", attrs={"class": "Tiles__TileBackground-fk0fs3-0 cSObNX"}).find("img")["src"]
            price = soup.find("div", attrs={"class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"}).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            print(price)
        except:
            pass
        return data
  
    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ",items)
Пример #3
0
def loggin(ema, pas):
    try:
        browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    except:
        # selenium.common.exceptions.WebDriverException: Message: Access is denied. (os error 5)
        # mozilla is updating
        print('probably updating sleep 30')
        sleep(30)
        browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    # connect to site
    try:
        browser.load_url(
            "https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
            wait_on_page=5,
            wait_for_page_body=True)

    except:
        # selenium.common.exceptions.NoSuchWindowException: Message: Browsing context has been discarded
        try:
            browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        except:
            # selenium.common.exceptions.WebDriverException: Message: Access is denied. (os error 5)
            # mozilla is updating
            print('probably updating sleep 30')
            sleep(30)
            browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)

        browser.load_url(
            "https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
            wait_on_page=5,
            wait_for_page_body=True)

    # reg_el.click()
    # maximise
    browser.maximize_window()
    # Scroll
    browser.execute_script("window.scrollTo(0,100)")
    try:
        email_el = browser.find_element_by_id("email--1")
    except:
        sleep(10)
        try:
            email_el = browser.find_element_by_id("email--1")
        except:
            return False
    email_el.send_keys(ema)
    # enter password
    pass_el = browser.find_element_by_id("id_password")
    pass_el.send_keys(pas)
    # find submit link
    sub_el = browser.find_element_by_id('submit-id-submit')
    # click submit
    sub_el.click()
    sleep(2)
    # check
    try:
        avatar = browser.find_element_by_id('u711-popover-trigger--18')
    except:
        avatar = None

    if avatar:
        return browser
    elif 'udemy.com' in browser.current_url:
        return browser
    else:
        return None
Пример #4
0
class TruliaHelper():
    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath,
                                       tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        keywords = ['512 W 10th St Perris CA 92570'] * 2
        for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id(
                "homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(keyword)
            search_btn = self.driver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']")
            if search_btn:
                print("Going to click")
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())

        self.driver.close()
        return items

    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            image = soup.find("div",
                              attrs={
                                  "class":
                                  "Tiles__TileBackground-fk0fs3-0 cSObNX"
                              }).find("img")["src"]
            price = soup.find(
                "div",
                attrs={
                    "class":
                    "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
                }).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            data.update({"image": image, "price": price})
        except:
            pass
        return data

    # method to write csv file
    def writeCSVFile(self, data):
        try:
            with open(
                    '/home/gc14/Documents/fiverr/custom_scrapers/home/trulia.csv',
                    mode='w') as csv_file:
                fieldnames = ['Image', 'Price']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                for d in data:
                    writer.writerow({'Image': d['image'], 'Price': d['price']})
                csv_file.close()
            print("File written successfully.")
        except:
            print(sys.exc_info())
            pass

    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ", len(items))
        if items:
            self.writeCSVFile(items)