Example #1
from typing import Optional

# Downloader and Parser are project classes; their definitions are not part of this example.


class Crawler:
    """Crawler that downloads the page for a keyword and prints the parsed result."""

    def __init__(self) -> None:
        self.downloader = Downloader()
        self.parser = Parser()

    def scrape(self, keyword: Optional[str] = None) -> bool:
        """Download the page for ``keyword`` and print the parsed result.

        Args:
            keyword: Search keyword; if omitted or empty, nothing is scraped.

        Returns:
            bool: True if a page was downloaded and parsed, False otherwise.
        """
        if keyword:
            html = self.downloader.get_text_from_keyword(keyword)
            if html:
                self.parser.print_result_from_parser(html)
                return True
        return False

    def change_url(self):
        """change_url"""
Example #2
import unittest

# Downloader is the project class under test; its definition is not part of this example.


class DownloaderTestCase(unittest.TestCase):
    def setUp(self):
        # Each test issues a real request for the keyword '手機' ("mobile phone").
        self.downloader = Downloader()
        self.keyword = '手機'
        self.response = self.downloader.request(self.keyword)

    def test_request(self):
        self.assertEqual(200, self.response.status_code)

    def test_get_response_text(self):
        html = self.downloader.get_response_text(self.response)

        self.assertTrue(html)

    def test_get_text_from_keyword(self):
        html = self.downloader.get_text_from_keyword(self.keyword)

        self.assertTrue(html)
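
These tests issue a live request in setUp, so they need network access. A common way to make the module directly runnable is the standard unittest entry point (the module name in the comment is hypothetical):

# Run with: python -m unittest test_downloader  (hypothetical module name),
# or add this guard at the bottom of the test module:
if __name__ == '__main__':
    unittest.main()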
Example #3
import time

# Downloader, Processor, Pipeline, Schedule and Request are project classes;
# their definitions are not part of this example.


class Spider:
    def __init__(self):
        self.downloader = Downloader()
        self.processor = Processor()
        self.pipeline = Pipeline()
        self.schedule = Schedule()

    def set_downloader(self, downloader):
        self.downloader = downloader

    def set_processor(self, processor):
        self.processor = processor

    def set_pipeline(self, pipeline):
        self.pipeline = pipeline

    def set_schedule(self, schedule):
        self.schedule = schedule

    def run(self):
        while True:
            request = self.schedule.pop()
            if request is None:
                print('waiting 5 seconds')
                time.sleep(5)
                continue
            print('pop url :' + request.url)
            page = self.downloader.download(request)
            if not page.download_success():
                # Push the failed request back onto the schedule for a retry.
                self.schedule.push(request)
                continue
            self.processor.process(page)
            self.add_target_url(page)
            if page.is_skip:
                continue
            # Persist the extracted item.
            self.pipeline.data_persistent(page.get_item())
            print('waiting 5 seconds')
            time.sleep(5)

    def add_target_url(self, page):
        for request in page.get_target_requests():
            self.schedule.push(request)
            print('add target url : ' + request.url)

    def add_start_url(self, request):
        if isinstance(request, Request):
            self.schedule.push(request)
        elif isinstance(request, str):
            self.schedule.push(Request(request))
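
A minimal sketch of driving this Spider, assuming the default collaborators created in __init__ are acceptable; the start URL is a placeholder:

# Minimal driver sketch; the start URL is a placeholder.
spider = Spider()
spider.add_start_url('https://example.com/start')
spider.run()  # loops forever, sleeping 5 seconds whenever the schedule is empty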
Example #4
 def setUp(self):
     self.downloader = Downloader()
     self.keyword = '手機'
     self.response = self.downloader.request(self.keyword)
Example #5
 def __init__(self, param: str, downloader=Downloader(), **kwargs):
     # The default Downloader() is created once, when the method is defined,
     # so every call that omits `downloader` shares that single instance.
     super().__init__(
         downloader,
         urljoin(config.get("default", "default_url"), param),
         name=kwargs["name"],
     )
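
If sharing one default Downloader across all calls is not intended, the usual alternative is the None-default idiom; a minimal sketch, assuming the same parent-class call as above:

# None-default sketch: a fresh Downloader per call instead of one shared default.
def __init__(self, param: str, downloader=None, **kwargs):
    if downloader is None:
        downloader = Downloader()
    super().__init__(
        downloader,
        urljoin(config.get("default", "default_url"), param),
        name=kwargs["name"],
    )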
Example #6
    def crawl(self,
              download_searches=True,
              download_offers=True,
              remove_files=False,
              start_page=1,
              end_page=30,
              rent=True):
        if remove_files:
            log.info("Removing files")
            filer = Filer(self.download_path_offers)
            filer.empty_dir()

            filer = Filer(self.download_path_searches)
            filer.empty_dir()

        if download_searches:
            log.info("Downloading files")
            for service in self.services:
                d = Downloader(self.download_path_searches,
                               self.download_path_offers,
                               service=service,
                               city=self.city,
                               property_type=self.property_type,
                               rent=rent)
                d.download_main_pages(start_page, end_page)
        """
        Get all links to offers
        """
        filer_searches = Filer(self.download_path_searches)
        all_offers = {}

        for service in self.services:
            if service not in all_offers:
                all_offers[service] = []

            for file in filer_searches.get_all_files():
                with open("{}/{}".format(self.download_path_searches, file),
                          "r",
                          encoding="utf-8") as f:
                    scraper = Scraper(f.read(), service)
                    for link in scraper.get_search_results():
                        all_offers[service].append(link)

        log.debug("Links to offers:\t{}".format(len(all_offers)))
        """
        Download offers
        """
        if download_offers:
            for service in self.services:
                d = Downloader(self.download_path_searches,
                               self.download_path_offers,
                               service=service,
                               city=self.city)
                progress = 0
                for url in all_offers[service]:
                    if d.download_offer_page(url):
                        progress += 1
                    log.info("Downloaded [{}]:\t{}/{}".format(
                        service, progress, len(all_offers[service])))

        filer = Filer(self.download_path_offers)
        all_files = filer.get_all_files()
        """
        Add offers to DB
        """
        counter = 0
        for file in all_files:
            if counter % 50 == 0:  # commit records in DB every 50 offers
                session.commit()
            params = {}

            with open("{}/{}".format(self.download_path_offers, file),
                      "r",
                      encoding="utf-8") as f:
                if "gratka" in file:
                    log.debug("Parsing gratka file:\t{}".format(file))
                    parser = ParserGratka(f.read())
                    params = parser.parse_site()

                elif "otodom" in file:
                    log.debug("Parsing otodom file:\t{}".format(file))
                    parser = ParserOtodom(f.read())
                    params = parser.parse_site()

                if "offer_id" in params:
                    instance = session.query(Offer).filter(
                        Offer.offer_id == params['offer_id']).first()
                    if not instance:
                        log.info("Adding offer:\t{}\t{}/{}".format(
                            params['offer_id'], counter, len(all_files)))
                        session.add(Offer(**params))
            counter += 1
        session.commit()
Example #7
 def __init__(self) -> None:
     self.downloader = Downloader()
     self.parser = Parser()
Example #8
 def __init__(self):
     self.downloader = Downloader()
     self.processor = Processor()
     self.pipeline = Pipeline()
     self.schedule = Schedule()
Example #9
    def crawl(self):
        for start_link in self.starting_urls:
            downloader = Downloader(start_link)
            app = downloader.get_app_from_link()

            self.links_visited.add(start_link)

            self.apps.append(app)
            self.depth_links.append(app.in_links)
            self.depth_links.append(app.out_links)
            file_name = './resources/jsonFiles/item_pipeline_{}_{}.json'.format(0, app.uid)
            with open(file_name, 'w') as outfile:
                json.dump(app.__dict__, outfile)
            self.num_docs_crawled = 1

        while self.num_docs_crawled < self.num_docs_to_be_crawled:
            # Expand along the in-links at the current depth, up to self.in_degree new links.
            current_in_links = []
            current_out_links = []

            count = 0
            for link in self.depth_links[self.current_depth]:
                if link not in self.links_visited and count < self.in_degree:
                    current_app = Downloader(link).get_app_from_link()
                    if current_app == 0:
                        # A return value of 0 marks a page that could not be fetched; skip it.
                        continue
                    current_in_links.extend(current_app.in_links)
                    current_out_links.extend(current_app.out_links)
                    file_name = './resources/jsonFiles/item_pipeline_{}_{}.json'.format(
                        self.num_docs_crawled, current_app.uid)
                    with open(file_name, 'w') as outfile:
                        json.dump(current_app.__dict__, outfile)
                    update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                    self.num_docs_crawled += 1
                    self.apps.append(current_app)
                    self.links_visited.add(link)
                    count += 1

            self.depth_links.append(current_in_links)
            self.depth_links.append(current_out_links)
            self.current_depth += 1

            # Then expand along the out-links, up to self.out_degree new links.
            current_in_links = []
            current_out_links = []

            count = 0
            for link in self.depth_links[self.current_depth]:
                if link not in self.links_visited and count < self.out_degree:
                    current_app = Downloader(link).get_app_from_link()
                    if current_app == 0:
                        continue
                    current_in_links.extend(current_app.in_links)
                    current_out_links.extend(current_app.out_links)
                    file_name = './resources/jsonFiles/item_pipeline_{}_{}.json'.format(
                        self.num_docs_crawled, current_app.uid)
                    with open(file_name, 'w') as outfile:
                        json.dump(current_app.__dict__, outfile)
                    update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                    self.num_docs_crawled += 1
                    self.apps.append(current_app)
                    self.links_visited.add(link)
                    count += 1

            self.current_depth += 1
            self.depth_links.append(current_in_links)
            self.depth_links.append(current_out_links)