Example #1
import os
import platform
import sys
import tarfile
import urllib.request
import zipfile
from scrapcore.logger import Logger

logger = Logger()
logger.setup_logger()
logger = logger.get_logger()


class PhantomInstall():

    home_dir = os.path.expanduser('phantomjs/')
    binary_win = 'phantomjs-2.1.1-windows/bin/phantomjs.exe'
    binary_linux64 = 'phantomjs-2.1.1-linux-x86_64/bin/phantomjs'
    binary_linux32 = 'phantomjs-2.1.1-linux-i686/bin/phantomjs'

    def get_os(self):
        return platform.system()

    def detect_phantomjs(self):
        logger.info('detecting phantomjs')
        this_os = self.get_os().lower()
        if 'windows' in this_os:
            if os.path.isfile(self.home_dir + self.binary_win):
                return self.home_dir + self.binary_win
            else:
                return False
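
The excerpt above is truncated after the Windows branch of detect_phantomjs; the Linux binaries declared as class attributes are handled in the part of the method that is not shown. A minimal usage sketch, assuming only the class and methods visible above:

# Hypothetical usage of the PhantomInstall helper shown above.
installer = PhantomInstall()
print('Detected OS:', installer.get_os())

binary_path = installer.detect_phantomjs()
if binary_path:
    print('phantomjs binary found at', binary_path)
else:
    print('phantomjs binary not found; it would have to be downloaded and unpacked first')
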
Example #2
    def main(self, return_results=False, config=None):
        """the main method"""

        use_control = config.get('use_control')
        logger = Logger()
        logger.setup_logger(level=config.get('log_level').upper())
        self.logger = logger.get_logger()
        proxy_file = config.get('proxy_file', '')

        search_instances = config.get('search_instances', [{
            'engine': 'google'
        }])
        if not isinstance(search_instances, list):
            raise ValueError(
                'Please provide a list of search instance objects')

        num_search_instances = len(search_instances)
        num_workers = int(config.get('num_workers'))
        scrape_method = config.get('scrape_method')
        pages = int(config.get('num_pages_for_keyword', 1))
        method = config.get('scrape_method', 'selenium')

        all_keyword_objs = config.get('keywords', [])
        scraper_searches = []
        for index, keyword_obj in enumerate(all_keyword_objs):
            # downstream code (ScrapeJobGenerator etc.) expects keywords as a list, so wrap the single keyword
            single_keyword_as_list = [keyword_obj['keyword']]
            category = keyword_obj['category']

            result_writer = ResultWriter()
            result_writer.init_outfile(config, force_reload=True)
            cache_manager = CacheManager(config, self.logger, result_writer)

            scrape_jobs = ScrapeJobGenerator().get(single_keyword_as_list,
                                                   search_instances,
                                                   scrape_method, pages)
            scrape_jobs = list(scrape_jobs)

            if use_control:
                control_jobs = ScrapeJobGenerator().get(
                    single_keyword_as_list, search_instances, scrape_method,
                    pages)
            else:
                control_jobs = []
            control_jobs = list(control_jobs)
            proxies = []

            if config.get('use_own_ip'):
                proxies.append(None)
            elif proxy_file:
                proxies = Proxies().parse_proxy_file(proxy_file)

            if not proxies:
                raise Exception('''No proxies available. Turning down.''')
            shuffle(proxies)

            # get a scoped sqlalchemy session
            session_cls = get_session(config, scoped=True)
            session = session_cls()

            # add fixtures
            fixtures(config, session)

            # add proxies to the database
            Proxies().add_proxies_to_db(proxies, session)

            scraper_search = ScraperSearch(
                number_search_instances_used=num_search_instances,
                number_proxies_used=len(proxies),
                number_search_queries=len(single_keyword_as_list),
                started_searching=datetime.datetime.utcnow(),
                used_search_instances=','.join(
                    [instance['engine'] for instance in search_instances]))

            # first check cache
            if config.get('do_caching'):
                scrape_jobs = cache_manager.filter_scrape_jobs(
                    scrape_jobs, session, scraper_search)
            # Initialize the thread lists here so the sleep logic at the end of the
            # loop also works when all scrape jobs were served from the cache.
            threads, control_threads = [], []
            if scrape_jobs:
                # Create a lock to synchronize database
                # access in the sqlalchemy session
                db_lock = threading.Lock()

                # create a lock to cache results
                cache_lock = threading.Lock()

                # A lock to prevent multiple threads from solving captcha,
                # used in selenium instances.
                captcha_lock = threading.Lock()

                # self.logger.info(
                #     '''
                #     Going to scrape {num_keywords} single_keyword_as_list with {num_proxies}
                #     proxies by using {num_threads} threads.
                #     '''.format(
                #         num_keywords=len(scrape_jobs),
                #         num_proxies=len(proxies),
                #         num_threads=num_search_instances)
                #     )

                progress_thread = None

                # Show the progress of the scraping
                q = queue.Queue()
                progress_thread = ShowProgressQueue(config, q,
                                                    len(scrape_jobs))
                progress_thread.start()

                workers = queue.Queue()
                control_workers = queue.Queue()
                num_worker = 0

                for _, search_instance in enumerate(search_instances):
                    for proxy in proxies:
                        for worker in range(num_workers):
                            num_worker += 1
                            workers.put(
                                ScrapeWorkerFactory(
                                    config,
                                    cache_manager=cache_manager,
                                    mode=method,
                                    proxy=proxy,
                                    search_instance=search_instance,
                                    session=session,
                                    db_lock=db_lock,
                                    cache_lock=cache_lock,
                                    scraper_search=scraper_search,
                                    captcha_lock=captcha_lock,
                                    progress_queue=q,
                                    browser_num=num_worker))
                            if use_control:
                                control_workers.put(
                                    ScrapeWorkerFactory(
                                        config,
                                        cache_manager=cache_manager,
                                        mode=method,
                                        proxy=proxy,
                                        search_instance=search_instance,
                                        session=session,
                                        db_lock=db_lock,
                                        cache_lock=cache_lock,
                                        scraper_search=scraper_search,
                                        captcha_lock=captcha_lock,
                                        progress_queue=q,
                                        browser_num=num_worker))

                # here we look for suitable workers
                # for all jobs created.
                for (joblist, workerq) in [(scrape_jobs, workers),
                                           (control_jobs, control_workers)]:
                    for job in joblist:
                        while True:
                            worker = workerq.get()
                            workerq.put(worker)
                            if worker.is_suitable(job):
                                worker.add_job(job)
                                break

                threads, control_threads = [], []
                for (threadlist, workerq) in [(threads, workers),
                                              (control_threads,
                                               control_workers)]:
                    while not workerq.empty():
                        worker = workerq.get()
                        thread = worker.get_worker()
                        if thread:
                            threadlist.append(thread)

                if len(threads) != len(control_threads) and use_control:
                    q.put('done')
                    progress_thread.join()
                    raise ValueError(
                        "Something went wrong w/ threads, check config")

                if use_control:
                    for thread, control_thread in zip(threads,
                                                      control_threads):
                        thread.start()
                        thread.mark_category(category)
                        control_thread.mark_as_control()
                        control_thread.start()
                        control_thread.mark_category(category)
                else:
                    for thread in threads:
                        thread.start()
                        thread.mark_category(category)
                        time.sleep(BETWEEN_THREADS)
                        time.sleep(randrange(0, RAND_MAX))
                for thread in threads:
                    thread.join()
                for thread in control_threads:
                    thread.join()

                # after threads are done, stop the progress queue.
                q.put('done')
                progress_thread.join()

            result_writer.close_outfile()

            scraper_search.stopped_searching = datetime.datetime.utcnow()
            session.add(scraper_search)
            session.commit()
            scraper_searches.append(scraper_search)
            print('Finished with the keyword {}'.format(
                str(single_keyword_as_list)))
            if index != len(all_keyword_objs) - 1:
                sleep_mins = len(threads) + len(control_threads)
                print("""
                    Going to sleep 1 minute per query made, for a total of {} minutes
                    """.format(sleep_mins))
                time.sleep((60 - BETWEEN_THREADS) * sleep_mins)

        if return_results:
            return scraper_searches
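
For orientation, here is a sketch of the configuration dictionary this variant of main() reads. The key names are taken directly from the config.get() calls above; the values are illustrative assumptions, not project defaults.

# Illustrative config for the main() above; keys come from the excerpt, values are assumptions.
config = {
    'log_level': 'info',
    'use_control': False,                        # also run control jobs per keyword
    'proxy_file': '',                            # path to a proxy list file, or empty
    'use_own_ip': True,                          # scrape without proxies
    'search_instances': [{'engine': 'google'}],
    'keywords': [{'keyword': 'example query', 'category': 'demo'}],
    'num_workers': 1,
    'num_pages_for_keyword': 1,
    'scrape_method': 'selenium',
    'do_caching': True,
}
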
Example #3
File: core.py  Project: zenospav/SerpScrap
    def main(self, return_results=False, config=None):
        """the main method"""

        logger = Logger()
        logger.setup_logger(level=config.get('log_level').upper())
        self.logger = logger.get_logger()

        keywords = set(config.get('keywords', []))
        proxy_file = config.get('proxy_file', '')

        # when no search engine is specified, use google
        search_engines = config.get('search_engines', ['google'])
        if not isinstance(search_engines, list):
            if search_engines == '*':
                search_engines = config.get('supported_search_engines')
            else:
                search_engines = search_engines.split(',')
        search_engines = set(search_engines)

        num_search_engines = len(search_engines)
        num_workers = int(config.get('num_workers'))
        scrape_method = config.get('scrape_method')
        pages = int(config.get('num_pages_for_keyword', 1))
        method = config.get('scrape_method', 'selenium')

        result_writer = ResultWriter()
        result_writer.init_outfile(config, force_reload=True)

        cache_manager = CacheManager(config, self.logger, result_writer)

        scrape_jobs = {}

        if not scrape_jobs:
            scrape_jobs = ScrapeJobGenerator().get(keywords, search_engines,
                                                   scrape_method, pages)

        scrape_jobs = list(scrape_jobs)

        proxies = []

        if config.get('use_own_ip'):
            proxies.append(None)
        elif proxy_file:
            proxies = Proxies().parse_proxy_file(proxy_file)

        if not proxies:
            raise Exception('''No proxies available. Turning down.''')
        shuffle(proxies)

        # get a scoped sqlalchemy session
        session_cls = get_session(config, scoped=True)
        session = session_cls()

        # add fixtures
        fixtures(config, session)

        # add proxies to the database
        Proxies().add_proxies_to_db(proxies, session)

        scraper_search = ScraperSearch(
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

        # First of all, let's see how many requests remain
        # to issue after searching the cache.
        if config.get('do_caching'):
            scrape_jobs = cache_manager.filter_scrape_jobs(
                scrape_jobs, session, scraper_search)

        if scrape_jobs:

            # Create a lock to synchronize database
            # access in the sqlalchemy session
            db_lock = threading.Lock()

            # create a lock to cache results
            cache_lock = threading.Lock()

            # A lock to prevent multiple threads from solving captcha,
            # used in selenium instances.
            captcha_lock = threading.Lock()

            self.logger.info('''
                Going to scrape {num_keywords} keywords with {num_proxies}
                proxies by using {num_threads} threads.'''.format(
                num_keywords=len(list(scrape_jobs)),
                num_proxies=len(proxies),
                num_threads=num_search_engines))

            progress_thread = None

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:
                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(config,
                                                cache_manager=cache_manager,
                                                mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        result_writer.close_outfile()

        scraper_search.stopped_searching = datetime.datetime.utcnow()
        try:
            session.add(scraper_search)
            session.commit()
        except Exception:
            pass

        if return_results:
            return scraper_search
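
The job assignment above rotates a queue.Queue until some worker accepts the job. A self-contained sketch of that pattern, with a hypothetical DummyWorker standing in for ScrapeWorkerFactory (the is_suitabe spelling follows the excerpt):

import queue

class DummyWorker:
    """Hypothetical stand-in for ScrapeWorkerFactory; not part of the library."""

    def __init__(self, engine):
        self.engine = engine
        self.jobs = []

    def is_suitabe(self, job):
        # a worker only accepts jobs for its own search engine
        return job['search_engine'] == self.engine

    def add_job(self, job):
        self.jobs.append(job)

workers = queue.Queue()
for engine in ('google', 'bing'):
    workers.put(DummyWorker(engine))

scrape_jobs = [{'query': 'a', 'search_engine': 'google'},
               {'query': 'b', 'search_engine': 'bing'}]

# Rotate the queue until a suitable worker takes the job, mirroring the loop above.
# As in the original, this spins forever if no worker is suitable for a job.
for job in scrape_jobs:
    while True:
        worker = workers.get()
        workers.put(worker)
        if worker.is_suitabe(job):
            worker.add_job(job)
            break
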
Example #4
File: core.py  Project: DrSn2/SerpScrap
    def main(self, return_results=False, config=None):
        """the main method"""

        logger = Logger()
        logger.setup_logger(level=config.get('log_level').upper())
        self.logger = logger.get_logger()

#         kwfile = config.get('keyword_file', '')
#         if kwfile:
#             kwfile = os.path.abspath(kwfile)
        kwfile = None

        keywords = set(config.get('keywords', []))
        proxy_file = config.get('proxy_file', '')

        # when no search engine is specified, use google
        search_engines = config.get('search_engines', ['google'])
        if not isinstance(search_engines, list):
            if search_engines == '*':
                search_engines = config.get('supported_search_engines')
            else:
                search_engines = search_engines.split(',')
        search_engines = set(search_engines)

        num_search_engines = len(search_engines)
        num_workers = int(config.get('num_workers'))
        scrape_method = config.get('scrape_method')
        pages = int(config.get('num_pages_for_keyword', 1))
        method = config.get('scrape_method', 'selenium')

        result_writer = ResultWriter()
        result_writer.init_outfile(config, force_reload=True)

        cache_manager = CacheManager(config, self.logger, result_writer)

        scrape_jobs = {}
#         if kwfile:
#             if not os.path.exists(kwfile):
#                 raise WrongConfigurationError('The keyword file {} does not exist.'.format(kwfile))
#             else:
#                 if kwfile.endswith('.py'):
#                     # we need to import the variable "scrape_jobs" from the module.
#                     sys.path.append(os.path.dirname(kwfile))
#                     try:
#                         modname = os.path.split(kwfile)[-1].rstrip('.py')
#                         scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
#                     except ImportError as e:
#                         logger.warning(e)
#                 else:
#                     # Clean the keywords of duplicates right in the beginning
#                     keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip()])

        if not scrape_jobs:
            scrape_jobs = ScrapeJobGenerator().get(
                keywords,
                search_engines,
                scrape_method,
                pages
            )

        scrape_jobs = list(scrape_jobs)

        proxies = []

        if config.get('use_own_ip'):
            proxies.append(None)
        elif proxy_file:
            proxies = Proxies().parse_proxy_file(proxy_file)

        if not proxies:
            raise Exception('''No proxies available. Turning down.''')

        # get a scoped sqlalchemy session
        session_cls = get_session(config, scoped=False)
        session = session_cls()

        # add fixtures
        fixtures(config, session)

        # add proxies to the database
        Proxies().add_proxies_to_db(proxies, session)

        # Ask the user whether to continue the last scrape. We detect a continuation
        # of a previous scrape if the keyword file is the same and unmodified since
        # the start of the last scrape.
        scraper_search = None
#         if kwfile and config.get('continue_last_scrape', False):
#             searches = session.query(ScraperSearch). \
#                 filter(ScraperSearch.keyword_file == kwfile). \
#                 order_by(ScraperSearch.started_searching). \
#                 all()
#
#             if searches:
#                 last_search = searches[-1]
#                 last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))
#
#                 # if the last modification is older then the starting of the search
#                 if last_modified < last_search.started_searching:
#                     scraper_search = last_search
#                     logger.info('Continuing last scrape.')

        if not scraper_search:
            scraper_search = ScraperSearch(
                keyword_file=kwfile,
                number_search_engines_used=num_search_engines,
                number_proxies_used=len(proxies),
                number_search_queries=len(keywords),
                started_searching=datetime.datetime.utcnow(),
                used_search_engines=','.join(search_engines)
            )

        # First of all, let's see how many requests remain
        # to issue after searching the cache.
        if config.get('do_caching'):
            scrape_jobs = cache_manager.filter_scrape_jobs(
                scrape_jobs,
                session,
                scraper_search
            )

        if scrape_jobs:

            # Create a lock to synchronize database
            # access in the sqlalchemy session
            db_lock = threading.Lock()

            # create a lock to cache results
            cache_lock = threading.Lock()

            # A lock to prevent multiple threads from solving captcha,
            # used in selenium instances.
            captcha_lock = threading.Lock()

            self.logger.info('''
                Going to scrape {num_keywords} keywords with {num_proxies}
                proxies by using {num_threads} threads.'''.format(
                    num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines)
                )

            progress_thread = None

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(config, q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(
                                config,
                                cache_manager=cache_manager,
                                mode=method,
                                proxy=proxy,
                                search_engine=search_engine,
                                session=session,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                progress_queue=q,
                                browser_num=num_worker
                            )
                        )

            # here we look for suitable workers
            # for all jobs created.
            for job in scrape_jobs:
                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')
            progress_thread.join()

        result_writer.close_outfile()

        scraper_search.stopped_searching = datetime.datetime.utcnow()
        session.add(scraper_search)
        session.commit()

        if return_results:
            return scraper_search
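
All three main() variants stop their progress reporter by pushing a 'done' sentinel into the queue and joining the thread. A minimal, self-contained sketch of that pattern, with a hypothetical ProgressPrinter in place of ShowProgressQueue:

import queue
import threading

class ProgressPrinter(threading.Thread):
    """Hypothetical stand-in for ShowProgressQueue; not the library class."""

    def __init__(self, q, num_jobs):
        super().__init__()
        self.q = q
        self.num_jobs = num_jobs
        self.done_jobs = 0

    def run(self):
        while True:
            item = self.q.get()
            if item == 'done':  # sentinel used by the examples above
                break
            self.done_jobs += 1
            print('progress: {}/{}'.format(self.done_jobs, self.num_jobs))

q = queue.Queue()
progress_thread = ProgressPrinter(q, num_jobs=3)
progress_thread.start()

for _ in range(3):
    q.put('one job finished')  # workers would report progress like this

q.put('done')  # stop the progress thread
progress_thread.join()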