Exemplo n.º 1
0
    def get_single_element_BY(self, target):
        """Look up a single element described by a ``{type: value}`` mapping.

        Args:
            target: Dictionary whose item maps a locator type (``CLASS``,
                ``ID``, ``XPATH`` or ``TAG``) to the locator value. Only the
                item returned by ``popitem()`` on a copy is used, so the
                caller's dictionary is left untouched.

        Returns:
            The result of the matching private ``__get_element_by_*`` helper,
            or ``None`` when the type is unsupported or the lookup raised.
            Failures are reported with ``print`` rather than re-raised.

        Raises:
            AssertionError: If ``target`` is not a dictionary.
        """
        assert isinstance(
            target,
            dict), 'Target must be a dictionary with key:value [type:target]'
        element = None
        # Bound before the try so the error report below never touches an
        # unbound name (e.g. when popitem() fails on an empty dictionary).
        _type = _target = None
        try:
            _type, _target = target.copy().popitem()

            # Compare by value: ``is`` only matches when the key is the very
            # same object as the constant, which equal strings need not be.
            if _type == CLASS:
                element = self.__get_element_by_class(_target)
            elif _type == ID:
                element = self.__get_element_by_id(_target)
            elif _type == XPATH:
                element = self.__get_element_by_xpath(_target)
            elif _type == TAG:
                element = self.__get_element_by_tag(_target)
            else:
                print('Error - Type ( {} ).'.format(_type))

        except Exception as e:
            # Best effort: report the failed lookup and fall through to
            # return None instead of propagating the error.
            print(
                'Exception: {}: {} -> {} - is not present in page: {}'.format(
                    e, c.blue(str(_type)), c.red(str(_target)),
                    c.orange(self.webBrowser.current_url)))

        # The return lives outside any ``finally`` on purpose: returning from
        # ``finally`` would silently discard KeyboardInterrupt/SystemExit.
        if self.log and element is not None:
            print('{} ---> {}'.format(c.blue(_type), c.green(_target)))
        return element
Exemplo n.º 2
0
    def find_elements_BY(self, element, target):
        """Search inside ``element`` using a ``{type: value}`` mapping.

        Args:
            element: Object handed to the private ``__find_elements_by_*``
                helper as the search root.
            target: Dictionary whose item maps a locator type (``CLASS``,
                ``ID``, ``XPATH``, ``TAG`` or ``LINK``) to the locator value.
                Only the item returned by ``popitem()`` on a copy is used, so
                the caller's dictionary is left untouched.

        Returns:
            The result of the matching helper, or ``None`` when the type is
            unsupported or the lookup raised. Failures are reported with
            ``print`` rather than re-raised.

        Raises:
            AssertionError: If ``target`` is not a dictionary.
        """
        assert isinstance(
            target,
            dict), 'Target must be a dictionary with key:value [type:target]'
        nested_element = None
        # Bound before the try so the error report below never touches an
        # unbound name (e.g. when popitem() fails on an empty dictionary).
        _type = _target = None
        try:
            _type, _target = target.copy().popitem()

            # Compare by value (``is`` tests identity) and use one exclusive
            # chain, since a key can only match a single locator type.
            if _type == CLASS:
                nested_element = self.__find_elements_by_class(
                    element, _target)
            elif _type == ID:
                nested_element = self.__find_elements_by_id(element, _target)
            elif _type == XPATH:
                nested_element = self.__find_elements_by_xpath(
                    element, _target)
            elif _type == TAG:
                nested_element = self.__find_elements_by_tag(element, _target)
            elif _type == LINK:
                nested_element = self.__find_elements_by_partial_link_text(
                    element, _target)
            else:
                # An unsupported type used to return None without a trace.
                print('Error - Type ( {} ).'.format(_type))

        except Exception as e:
            # Best effort: report the failed lookup and fall through to
            # return None instead of propagating the error.
            print(
                'Exception: {}: {} -> {} - is not present in page: {}'.format(
                    e, c.blue(str(_type)), c.red(str(_target)),
                    c.orange(self.webBrowser.current_url)))

        # The return lives outside any ``finally`` on purpose: returning from
        # ``finally`` would silently discard KeyboardInterrupt/SystemExit.
        if self.log and nested_element is not None:
            print('{} ---> {}'.format(c.blue(_type), c.green(_target)))
        return nested_element
Exemplo n.º 3
0
    def __init__(self, delay=10, log=False, headless=False, tor=False):
        """Start a Firefox WebDriver session and remember the settings.

        Args:
            delay: Timeout passed to ``WebDriverWait``; also kept as
                ``self.delay``.
            log: Stored as ``self.log``.
            headless: When true, Firefox gets the ``-headless`` argument.
            tor: When true, the profile is pointed at a SOCKS proxy on
                127.0.0.1:9150 and a reminder is printed.
        """
        # Location of the geckodriver binary shipped with the package.
        driver_path = pkg_resources.resource_filename(
            pkg_name, geckodriver_file)
        firefox_profile = webdriver.FirefoxProfile()

        if tor:
            print(c.red('Warning '), c.blue(
                'Be sure you have Tor browser opened in backgroud!'))
            print('Tor Proxy', c.green('Enabled'))
            proxy_settings = (
                ('network.proxy.type', 1),
                ('network.proxy.socks', '127.0.0.1'),
                ('network.proxy.socks_port', 9150),
            )
            for pref_name, pref_value in proxy_settings:
                firefox_profile.set_preference(pref_name, pref_value)

        # Every cache-related preference is switched off.
        cache_switches = (
            'browser.cache.disk.enable',
            'browser.cache.memory.enable',
            'browser.cache.offline.enable',
            'network.http.use-cache',
        )
        for pref_name in cache_switches:
            firefox_profile.set_preference(pref_name, False)

        firefox_opts = Options()
        if headless:
            firefox_opts.add_argument('-headless')

        self.webBrowser = webdriver.Firefox(
            firefox_profile=firefox_profile,
            executable_path=driver_path,
            firefox_options=firefox_opts)

        self.delay = delay
        self.log = log
        self.log_file = 'geckodriver.log'
        self.wait = WebDriverWait(self.webBrowser, timeout=delay)
Exemplo n.º 4
0
 def area(self, idx=None):
     """List the geographic-area options and click the chosen one.

     Args:
         idx: Index of the option to click. When ``None`` the index is
             read from standard input.
     """
     tag = c.green('@area')
     dropdown = Select(Page.scraper.get_element_BY(Target.Search.area))
     choices = dropdown.options
     print(c.underline('Geographic Area:'))
     for position, choice in enumerate(choices):
         row = '[{}] - {}'.format(
             c.orange(str(position)), c.blue(choice.text))
         print(row)
     if idx is None:
         idx = int(input('insert the number [i] -> '))
     picked = choices[idx]
     # Read the label before clicking, as the original did.
     label = picked.text
     picked.click()
     print(Page.I + self.I + tag, c.underline(label))
Exemplo n.º 5
0
 def catagory(self, idx=None):
     """List the category options and click the chosen one.

     Args:
         idx: Index of the option to click. When ``None`` the index is
             read from standard input.
     """
     tag = c.green('@catagory')
     dropdown = Select(Page.scraper.get_element_BY(Target.Search.category))
     choices = dropdown.options
     print(c.underline('Category:'))
     for position, choice in enumerate(choices):
         row = '[{}] - {}'.format(
             c.orange(str(position)), c.blue(choice.text))
         print(row)
     if idx is None:
         idx = int(input('\nInsert number [i] -> '))
     picked = choices[idx]
     # Read the label before clicking, as the original did.
     label = picked.text
     picked.click()
     print(Page.I + self.I + tag, c.underline(label))
Exemplo n.º 6
0
 def info(self):
     """Return ``'| <entityName>| <username>'`` with the username in blue."""
     entity_label = str(self.entityName)
     coloured_user = c.blue(str(self.user['username']))
     return '| ' + entity_label + '| ' + coloured_user
Exemplo n.º 7
0
 def radio(radio):
     """Write a ``|Radio: <name>`` banner to stdout, name coloured blue."""
     coloured_name = c.blue(radio.name)
     banner = '\n\n|Radio: {} '.format(coloured_name)
     sys.stdout.write(banner)
Exemplo n.º 8
0
class Page:
    """Page objects for the subito.it search form, result list and ads.

    The nested classes all go through the single ``Page.scraper`` instance,
    which is created when this class body is executed.
    """

    # Shared scraper; instantiated once, at class-definition time.
    scraper = Scraper(log=LOG, headless=HEADLESS, tor=TOR)
    # Coloured prefix for the progress messages printed by the methods below.
    I = c.blue('Page/')

    class Search:
        """Actions on the search form."""

        I = c.orange('Search/')

        def goto_url(self):
            """Open the site's start page."""
            D = c.green('@goto_url')
            url = 'https://www.subito.it'
            Page.scraper.openUrl(url)
            print(Page.I + self.I + D, c.underline(url))

        def what(self, text=None):
            """Type the search text, prompting for it when ``text`` is None."""
            D = c.green('@what')
            if text is None:
                text = input('What you want search ->  ')
            search = Page.scraper.get_element_BY(Target.Search.name)
            search.send_keys(text)
            print(Page.I + self.I + D, c.underline(text))

        def catagory(self, idx=None):
            """List the category options and click the one at ``idx``.

            When ``idx`` is None the index is read from standard input.
            """
            D = c.green('@catagory')
            category = Page.scraper.get_element_BY(Target.Search.category)
            view = Select(category)
            options = view.options
            print(c.underline('Category:'))
            for i, op in enumerate(options):
                print('[{}] - {}'.format(c.orange(str(i)), c.blue(op.text)))
            if idx is None:
                idx = int(input('\nInsert number [i] -> '))
            chs = options[idx].text
            options[idx].click()
            print(Page.I + self.I + D, c.underline(chs))

        def area(self, idx=None):
            """List the geographic-area options and click the one at ``idx``.

            When ``idx`` is None the index is read from standard input.
            """
            D = c.green('@area')
            area = Page.scraper.get_element_BY(Target.Search.area)
            view = Select(area)
            options = view.options
            print(c.underline('Geographic Area:'))
            for i, op in enumerate(options):
                print('[{}] - {}'.format(c.orange(str(i)), c.blue(op.text)))
            if idx is None:
                idx = int(input('insert the number [i] -> '))
            chs = options[idx].text
            options[idx].click()
            print(Page.I + self.I + D, c.underline(chs))

        def continue_button(self):
            """Click the form's continue button."""
            D = c.green('@continue_button')
            b_continue = Page.scraper.get_element_BY(
                Target.Search.button_continue)
            b_continue.click()
            print(Page.I + self.I + D, c.underline('Continue'))

    class ListAd:
        """Actions on a page of search results."""

        I = c.orange('ListAd/')

        def links(self):
            """Collect one link per ad on the current result page.

            Returns:
                List of the values returned by the ``Target.ListAd.link``
                lookup, one per ad whose link container was found.
            """
            D = c.green('@links')
            links = []
            raw_ads = Page.scraper.get_nested_elements_from_root(
                Target.ListAd.list_link_element)
            for raw_ad in raw_ads:
                raw_link = Page.scraper.find_elements_BY(
                    raw_ad, Target.ListAd.raw_link)
                if not raw_link:
                    # The lookup came back None/empty; indexing it would
                    # abort the whole page, so skip just this ad.
                    continue
                link = Page.scraper.find_elements_BY(raw_link[0],
                                                     Target.ListAd.link)
                links.append(link)
            print(Page.I + self.I + D, 'Founded links: ',
                  c.underline(str(len(links))))
            return links

        def next(self, check=False):
            """Move to the next result page if there is one.

            Args:
                check: When true, only report whether a next page exists
                    without opening it.

            Returns:
                True when a next-page link was found, False otherwise.
            """
            D = c.green('@next')
            next_b = Page.scraper.get_element_BY(Target.ListAd.button_next)
            link = Page.scraper.find_elements_BY(next_b, Target.ListAd.link)
            if link:
                if check:
                    return True

                Page.scraper.openUrl(link)
                print(Page.I + self.I + D, c.underline('Next'))
                return True
            print(Page.I + self.I + D, c.red('Finish'))
            return False

        def index_page(self):
            """Return the current page index read from the pager's HTML."""
            idx = Page.scraper.get_element_BY(
                Target.ListAd.number_page).get_attribute('innerHTML')
            # NOTE(review): ``between`` presumably extracts the text between
            # the two markers; it is defined outside this view.
            idx = between('<strong>', '</strong>', idx)
            return idx

        def pages_links(self, chs=None):
            """Collect ad links from up to ``chs`` consecutive result pages.

            Args:
                chs: Number of pages to visit, or ``'all'`` (capped at
                    10000). When None the value is read from standard input.

            Returns:
                Dictionary mapping each page index to that page's list of
                links. Empty when ``chs`` is neither an integer nor
                ``'all'``.
            """
            if chs is None:
                chs = input(
                    'How many page you want scrab? type: int or "all" \n')

            if chs == 'all':
                chs = 10000
            else:
                try:
                    chs = int(chs)
                except ValueError:
                    print('type: int or "all"')
                    # Without a usable page count there is nothing to
                    # iterate; range() on the raw string would raise.
                    return dict()

            links = dict()
            for _ in tqdm(range(chs), desc='Pages: '):
                idx = self.index_page()
                links[idx] = self.links()
                if self.next() is False:
                    break
            return links

    class Ad:
        """Actions on a single ad page."""

        I = c.orange('Ad/')

        def record(self, url, page_idx):
            """Open ``url`` and scrape the ad into a dictionary.

            Args:
                url: Address of the ad page to open.
                page_idx: Index of the result page the ad was listed on.

            Returns:
                Dictionary with the keys ``page_index``, ``url``, ``info``,
                ``description``, ``date``, ``name`` and ``phone`` (the
                string ``'None'`` when no phone button is found).
            """
            record = dict()
            record['page_index'] = page_idx
            record['url'] = url
            Page.scraper.openUrl(url)
            # General Info
            record['info'] = Page.scraper.get_element_BY(
                Target.Ad.summary).text.split('\n')
            # Description
            record['description'] = Page.scraper.get_element_BY(
                Target.Ad.description).text
            # Date - name: both come from the same element, so look it up
            # once and split its text into lines.
            date_name = Page.scraper.get_element_BY(
                Target.Ad.date_name).text.split('\n')
            record['date'] = date_name[0]
            record['name'] = date_name[1]
            # Phone
            phone_button = Page.scraper.get_element_BY(Target.Ad.phone_button)
            if phone_button:
                # Click through JavaScript rather than the element's own
                # click(), then wait a second before reading the number.
                Page.scraper.webBrowser.execute_script("arguments[0].click();",
                                                       phone_button)
                sleep(1)
                record['phone'] = Page.scraper.get_element_BY(
                    Target.Ad.phone_number).text
            else:
                record['phone'] = 'None'

            return record

        def records_from_ad_links(self, ad_links):
            """Scrape every ad referenced by ``ad_links``.

            Args:
                ad_links: Dictionary mapping a page index to a list of ad
                    URLs, as built by ``ListAd.pages_links``.

            Returns:
                List of the records that could be scraped. Ads whose
                scraping raised are skipped and reported on stdout.
            """
            temp_db = []
            for page, list_link in tqdm(ad_links.items(), desc='Pages: '):
                for url in tqdm(list_link, desc='Links: '):
                    try:
                        temp_db.append(self.record(url, page))
                    except Exception as e:
                        # Best effort: one broken ad must not stop the run,
                        # but report it instead of dropping it silently.
                        print(Page.I + self.I + c.red('Skipped'), url, e)

            return temp_db