Example No. 1
    def __init__urls__(self):

        if self.topics is not None:

            if not isinstance(self.topics, list) and self.topics != 'all':

                # self.topics is a comma-separated string; split it into a
                # list of stripped topic names.
                self.topics = self.topics.split(',')
                self.topics = [name.strip() for name in self.topics]

            if self.topics != 'all':

                self.topics = [name.lower() for name in self.topics]

                # Log Info
                Logger.info('Topics : ' + ', '.join(self.topics))

            info = MediumScraper.main_urls['topics']

            self.get(info['url'])

            self.__get_topics_urls__()

        else:

            info = MediumScraper.main_urls['root']

            self.get(info['url'])

            self.__get_taps_urls__()
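
For reference, the topic normalization above can be exercised on its own; a minimal standalone sketch (no scraper instance required):

    # Standalone sketch of the normalization performed by __init__urls__.
    topics = 'AI, Data Science , Programming'

    if not isinstance(topics, list) and topics != 'all':
        topics = [name.strip() for name in topics.split(',')]

    if topics != 'all':
        topics = [name.lower() for name in topics]

    print(topics)  # ['ai', 'data science', 'programming']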
Example No. 2
    def export_data_json(self,
                         filename='posts_content.json',
                         overwrite=False,
                         indent_level=3,
                         sort_keys=False):

        if self.posts_content is not None:

            Writer.dict_to_json(json_filename=filename,
                                content=self.posts_content,
                                overwrite=overwrite,
                                indent_level=indent_level,
                                sort_keys=sort_keys)

        else:

            error_log = {
                'error_type': 'ValueError',
                'message': 'No data to export'
            }

            Logger.write_messages_json(error_log)

            # Log Error
            Logger.error('Export failed, check the log file')
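
The Writer.dict_to_json helper can also be exercised directly; a minimal sketch using the keyword names exactly as they appear above (the sample content is illustrative):

    # Direct use of the Writer helper with the keywords shown above.
    Writer.dict_to_json(json_filename='sample.json',
                        content={'url': ['https://medium.com/p/abc'],
                                 'text': ['Hello']},
                        overwrite=True,
                        indent_level=3,
                        sort_keys=False)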
Example No. 3
    def scrape_content_from_file(self,
                                 metadata_filename='posts_metadata.json',
                                 export_json=True,
                                 export_csv=True,
                                 export_overwrite=True,
                                 timeout_export=False,
                                 set_quit=True):

        try:

            _metadata = Reader.json_to_dict(metadata_filename)

            self.metadata = _metadata

            if not hasattr(self, 'posts_content'):

                self.posts_content = dict()

            n_post = self.get_posts_count()

            Logger.info('No. of posts :', str(n_post))

            if n_post > 0:

                self.timeout_export = timeout_export

                self.export_json = export_json
                self.export_csv = export_csv

                self.__get_data__()

            if export_json:

                self.export_data_json(filename='posts_content.json',
                                      overwrite=export_overwrite,
                                      indent_level=3,
                                      sort_keys=False)

            if export_csv:

                self.export_data_csv(filename='posts_content.csv',
                                     overwrite=export_overwrite)

        except (WebDriverException, ScraperException) as error:

            # Log error
            Logger.error(error)

            if set_quit:

                self.quit()

        else:

            if set_quit:

                self.quit()
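
A plausible two-phase workflow implied by this method: collect metadata first, then scrape content from the saved file in a second session. The MediumScraper constructor arguments are assumptions, not confirmed by these examples:

    # Hypothetical two-phase workflow (constructor arguments are assumed).
    scraper = MediumScraper(browser='firefox', topics='all')
    scraper.run(scrape_content=False, set_quit=True)  # writes posts_metadata.json

    scraper = MediumScraper(browser='firefox', topics='all')
    scraper.scrape_content_from_file(metadata_filename='posts_metadata.json',
                                     export_json=True,
                                     export_csv=False)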
Example No. 4
    def run(self,
            scrape_content=False,
            export_metadata_json=True,
            export_metadata_csv=True,
            export_data_json=True,
            export_data_csv=True,
            export_overwrite=True,
            set_quit=True):

        try:

            self.__get_posts_metadata__()

            Logger.info('No. of posts :', str(self.get_posts_count()))

            if export_metadata_json:

                self.export_metadata_json(filename='posts_metadata.json',
                                          overwrite=export_overwrite,
                                          indent_level=3,
                                          sort_keys=False)

            if export_metadata_csv:

                self.export_metadata_csv(filename='posts_metadata.csv',
                                         overwrite=export_overwrite)

            if scrape_content:

                self.__get_data__()

            if export_data_json:

                self.export_data_json(filename='posts_content.json',
                                      overwrite=export_overwrite,
                                      indent_level=3,
                                      sort_keys=False)

            if export_data_csv:

                self.export_data_csv(filename='posts_content.csv',
                                     overwrite=export_overwrite)

        except (WebDriverException, ScraperException) as error:

            # Log error
            Logger.error(error)

            if set_quit:

                self.quit()

        else:

            if set_quit:

                self.quit()
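
run() bundles the whole pipeline into one call; a minimal sketch (the constructor arguments are, again, assumptions):

    # Hypothetical one-call pipeline: scrape metadata and content, export
    # everything, then quit the driver.
    MediumScraper(browser='chrome', topics='programming').run(
        scrape_content=True,
        export_overwrite=True,
        set_quit=True)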
Example No. 5
    def export_data_csv(self, filename='posts_content.csv', overwrite=False):

        if self.posts_content is not None:

            Writer.dict_to_csv(csv_filename=filename,
                               content=self.posts_content,
                               overwrite=overwrite,
                               use_pandas=True)

        else:

            error_log = {
                'error_type': 'ValueError',
                'message': 'No data to export'
            }

            Logger.write_messages_json(error_log)

            # Log Error
            Logger.error('Export failed, check the log file')
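
Writer.dict_to_csv with use_pandas=True presumably expects the columnar dict that get_post_content builds in Example No. 8: one list per column, all of equal length. A sketch of that inferred shape:

    # Columnar layout inferred from Example No. 8 (the shape is an assumption).
    posts_content = {
        'url': ['https://medium.com/p/abc', 'https://medium.com/p/def'],
        'text': ['First post body', 'Second post body'],
        'img_src': ['img1.png', 'img2.png'],
        'caption': ['A figure', 'Another figure'],
    }
    Writer.dict_to_csv(csv_filename='sample.csv',
                       content=posts_content,
                       overwrite=True,
                       use_pandas=True)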
Example No. 6
    def __init_web_driver__(self):

        self.__init_driver_options__()

        if self.browser == 'chrome':

            self.driver_path = self.__os_process__.locate_file(
                pattern='/chromedriver$',
                params='-i --regexp',
                updatedb=self.__updatedb__)[0]

            if self.driver_path is None:

                Logger.error(initialization_error_msg +
                             "Can't find the Chrome driver executable file")

            self.driver = webdriver.Chrome(options=self.options,
                                           executable_path=self.driver_path)

        elif self.browser == 'firefox':

            self.driver_path = self.__os_process__.locate_file(
                pattern='/geckodriver$',
                params='-i --regexp',
                updatedb=self.__updatedb__)[0]

            if self.driver_path is None:

                Logger.error(initialization_error_msg +
                             "Can't find the Firefox driver executable file")

            self.driver = webdriver.Firefox(options=self.options,
                                            executable_path=self.driver_path)

        else:

            # Log Error: unsupported browser name
            Logger.error(initialization_error_msg +
                         'Unsupported browser: ' + str(self.browser))
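
If the locate(1)-backed lookup is unavailable, the standard library offers a PATH-based alternative; a hedged sketch (shutil.which is standard, while Logger and initialization_error_msg come from the example above):

    import shutil

    # PATH-based lookup as an alternative to the locate-backed helper.
    driver_path = shutil.which('chromedriver')

    if driver_path is None:
        Logger.error(initialization_error_msg +
                     "Can't find the Chrome driver executable file")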
Example No. 7
    def export_metadata_csv(self, filename='posts_urls.csv', overwrite=False):

        if self.metadata is not None:

            keys = list(self.metadata.keys())

            metadata = {key: [] for key in self.metadata[keys[0]].keys()}
            metadata.setdefault('topic', [])

            for topic in self.metadata.keys():

                topic_name = [topic] * len(self.metadata[topic]['url'])

                metadata['topic'] += topic_name

                for key in self.metadata[topic].keys():

                    values = self.metadata[topic][key]
                    metadata[key] += values

            Writer.dict_to_csv(csv_filename=filename,
                               content=metadata,
                               overwrite=overwrite,
                               use_pandas=True)

        else:

            error_log = {
                'error_type': 'ValueError',
                'message': 'No urls to export'
            }

            Logger.write_messages_json(error_log)

            # Log Error
            Logger.error('Export failed, check the log file')
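
The loop above flattens the nested per-topic metadata into one flat table; a standalone sketch of the same transformation:

    # Nested input: {topic: {column: [values, ...]}}
    metadata_in = {
        'ai': {'url': ['u1', 'u2'], 'title': ['t1', 't2']},
        'programming': {'url': ['u3'], 'title': ['t3']},
    }

    # Flat output: one 'topic' column plus the original columns.
    flat = {key: [] for key in next(iter(metadata_in.values())).keys()}
    flat.setdefault('topic', [])

    for topic, columns in metadata_in.items():
        flat['topic'] += [topic] * len(columns['url'])
        for key, values in columns.items():
            flat[key] += values

    # flat == {'url': ['u1', 'u2', 'u3'],
    #          'title': ['t1', 't2', 't3'],
    #          'topic': ['ai', 'ai', 'programming']}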
Example No. 8
        def get_post_content():

            ok = check_limited_access()

            if ok is True and \
                    self.ignore_limited_access is False:

                Logger.warning('You have limited access: ' + url)

                answer = input('\tDo you want to continue - [y/n]: ')

                if answer.lower() == 'n':

                    return

                elif answer.lower() != 'y':

                    Logger.fail('Abort')

                    return

            elif ok is True and self.ignore_limited_access:

                Logger.warning('You have limited access: ' + url)

            elements_text = self.find_elements_by_xpath(xpath=text_xpath,
                                                        raise_error=False)
            elements_figure = self.find_elements_by_xpath(xpath=figure_xpath,
                                                          raise_error=False)

            text = get_text(elements_text)

            img_src, img_caption = get_figure(elements_figure)

            # First post scraped: initialize the columnar content dict
            if not self.posts_content:

                self.posts_content = {
                    'url': [],
                    'text': [],
                    'img_src': [],
                    'caption': []
                }

            self.posts_content['url'].append(url)
            self.posts_content['text'].append(text)
            self.posts_content['img_src'].append(img_src)
            self.posts_content['caption'].append(img_caption)
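
get_text and get_figure are not shown in these examples; a hedged sketch of what they plausibly do, using only the standard Selenium 3 WebElement API (.text, .get_attribute, find_elements_by_tag_name):

    # Hypothetical sketches of the undefined helpers; both assume standard
    # Selenium WebElements are passed in.
    def get_text(elements):
        # Join the visible text of each matched element.
        return '\n'.join(el.text for el in elements or [])

    def get_figure(elements):
        # Collect image sources and captions from <figure> elements.
        sources, captions = [], []
        for figure in elements or []:
            img = figure.find_elements_by_tag_name('img')
            cap = figure.find_elements_by_tag_name('figcaption')
            sources.append(img[0].get_attribute('src') if img else None)
            captions.append(cap[0].text if cap else None)
        return sources, captions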
Example No. 9
    def scroll_down(self, callback, delay=0.5, limit: int = -1, **meta):

        # Note: with the default limit of -1, range(limit) is empty, so the
        # callback runs without any scrolling having happened first.
        for i in range(limit):

            # scroll to - document.body.scrollHeight
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            Logger.info_r(f'steps : {i+1}/{limit}')

            Requests.sleep(delay)

        Logger.info('', end='\n')
        Logger.set_line(length=50)

        outputs = callback(**meta)

        return outputs
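
A usage sketch for scroll_down: scroll a fixed number of steps, then harvest links via the callback. Here scraper is assumed to be an initialized instance, and the XPath is illustrative:

    # Hypothetical callback: collect post links after scrolling finishes.
    def collect_links(xpath='//a'):
        elements = scraper.find_elements_by_xpath(xpath=xpath,
                                                  raise_error=False)
        return [el.get_attribute('href') for el in elements or []]

    links = scraper.scroll_down(collect_links, delay=1.0, limit=10,
                                xpath='//article//a')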
Example No. 10
    def init_model(self, set_quit=True):

        try:

            self.__init__model__()

            Logger.info('No. of topics :', str(len(self.topics_urls)))
            Logger.set_line(length=50)

        except (WebDriverException, ScraperException) as error:

            # Log error
            Logger.error(error)

            if set_quit:

                self.quit()

        else:

            if set_quit:

                self.quit()
Example No. 11
    def __get_data__(self):

        if self.metadata is None:

            error_log = {
                'error_type': 'ValueError',
                'message': 'No urls to iterate through'
            }
            Logger.write_messages_json(error_log)

            # Log Error
            Logger.error(initialization_error_msg)

            return None

        self.posts_content = dict()

        for topic, metadata in self.metadata.items():

            n_post = len(metadata['url'])

            Logger.info(f'Begin Scraping : {topic}')

            for i in range(n_post):

                Logger.info_r(f'scraped content : {i + 1}/{n_post}')

                self.__get_post_content__(url=metadata['url'][i])

            Logger.info(f'End Scraping : {topic}')
            Logger.set_line(length=50)
Example No. 12
    def get(self, url):

        self.driver.set_page_load_timeout(time_to_wait=self.time_to_wait)

        if 'script_timeout' in self.kwargs.keys():

            self.driver.set_script_timeout(
                time_to_wait=self.kwargs['script_timeout'])

        else:

            self.driver.set_script_timeout(0.001)

        for i in range(self.reload_page_count):

            try:

                self.driver.get(url=url)

                break

            except TimeoutException as error:

                if i < self.reload_page_count - 1:

                    Logger.fail(
                        str(i + 1) + ': timeout::page has been reloaded')
                    Logger.set_line(length=60)

                else:

                    Logger.fail(
                        str(i + 1) +
                        ': timeout::page reload limit has been exceeded\n'
                        '\tdo you want to try again - [y/n]: ',
                        end='')
                    ok = input()

                    Logger.set_line(length=60)

                    if ok.lower() == 'y':

                        self.time_to_wait = float(input('time to wait :'))
                        self.reload_page_count = int(input('reload count :'))

                        self.get(url)

                    elif ok.lower() == 'n':

                        self.___timeout_export__()
                        Logger.error(error)

                    else:

                        self.___timeout_export__()

                        Logger.fail('Abort')
                        Logger.error(error)

        self.scroll_height = self.driver.execute_script(
            "return document.body.scrollHeight")