Пример #1
0
    def _persist(self):
        '''
            persist linker state: the filter, the configs and the links
            configs and links are encoded and dumped as json into files
            opened through Helper.open with the linker work directory
        :return:
        '''
        # persist filter of linker
        self.__filter.persist()

        # save configs of linker; the handle is closed in a finally clause so
        # a failing encode/dump does not leak the open file
        fconfigs = Helper.open(self.workdir(), self.__CONFIG_FILE_NAME, "w")
        try:
            json.dump(self.__configs.encode(), fconfigs)
        finally:
            fconfigs.close()

        # save links of linker, with the same close guarantee
        flinks = Helper.open(self.workdir(), self.__LINKS_FILE_NAME, "w")
        try:
            json.dump(self.__links.encode(), flinks)
        finally:
            flinks.close()
Пример #2
0
    def _launch(self):
        '''
            launch linker: launch the filter, then load the configs and the
            links from their json files and decode them into the linker
        :return:
        '''
        # launch filter of linker
        self.__filter.launch()

        # load configs of linker; the handle is closed in a finally clause so
        # a malformed or unreadable file does not leak the open file
        fconfigs = Helper.open(self.workdir(), self.__CONFIG_FILE_NAME, "r")
        try:
            jsonobj = json.load(fconfigs)
        finally:
            fconfigs.close()
        self.__configs.decode(jsonobj)

        # load links of linker, with the same close guarantee
        flinks = Helper.open(self.workdir(), self.__LINKS_FILE_NAME, "r")
        try:
            jsonobj = json.load(flinks)
        finally:
            flinks.close()
        self.__links.decode(jsonobj)
Пример #3
0
 def _key(self, uri):
     '''
         build the key of @uri from its url
     :param uri: object, @Uri object
     :return: result of Helper.md5 applied to the url of @uri
     '''
     url = uri.url()
     return Helper.md5(url)
Пример #4
0
 def launch(self):
     '''
         launch extractor: run @_launch through Helper.timerun and log
         the time used
         an IOError raised while launching is logged and not re-raised,
         so the launch stays best-effort
     :return:
     '''
     try:
         time_used, _ = Helper.timerun(self._launch)
         logger.info("extractor: launch extractor - %s, time used: %fs",
                     self.name(), time_used)
     except IOError as e:
         # still swallowed on purpose, but no longer silently: report it
         # the same way shutdown/persist report their failures
         logger.info("extractor: launch extractor - %s, error: %s",
                     self.name(), e)
Пример #5
0
    def update(self, uri, extras):
        '''
            update uri context with the extras data of a crawl response
            the work is done by @_update; this wrapper times and logs it
        :param uri: object, Uri object
        :param extras: dict, extras data for crawled response
        :return:
        '''
        elapsed, _ = Helper.timerun(self._update, uri, extras)
        logger.info("linker: update link %s, updated. time used:%fs",
                    uri.url(), elapsed)
Пример #6
0
 def shutdown(self):
     '''
         shutdown extractor: run @_shutdown through Helper.timerun and
         log the time used
         any Exception raised while shutting down is logged and not
         re-raised
     :return:
     '''
     try:
         time_used, _ = Helper.timerun(self._shutdown)
         logger.info("extractor: shutdown extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         # log the exception itself: e.message is deprecated, is empty for
         # multi-argument errors such as IOError(errno, strerror), and does
         # not exist on python 3
         logger.info("extractor: shutdown extractor - %s, error: %s",
                     self.name(), e)
Пример #7
0
 def persist(self):
     '''
         persist extractor data: run @_persist through Helper.timerun and
         log the time used
         any Exception raised while persisting is logged and not
         re-raised
     :return:
     '''
     try:
         time_used, _ = Helper.timerun(self._persist)
         logger.info("extractor: persist extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         # log the exception itself: e.message is deprecated, is empty for
         # multi-argument errors such as IOError(errno, strerror), and does
         # not exist on python 3
         logger.info("extractor: persist extractor - %s, error: %s",
                     self.name(), e)
Пример #8
0
    def _parse(self, uri, content):
        '''
            parse the links of "img" tags out of @content
        :param uri: object, @Uri object the @content belongs to
        :param content: string, text to scan for "img" tags
        :return: list, one @Uri object per matched double-quoted "src"
                 value, combined with the url of @uri
        '''
        # "[^>]*?" keeps the match inside one tag. The former greedy ".*" ran
        # on to the last ' src="' of the line, so when several img tags
        # shared a line only the last link was returned.
        regex = re.compile(r'<img\b[^>]*?\ssrc="([^"]+)"[^>]*>',
                           re.IGNORECASE)

        # url of the page; every parsed link is combined with it and records
        # it as the second Uri argument
        base = uri.url()

        return [Uri(Helper.combine_path(base, url), base)
                for url in regex.findall(content)]
Пример #9
0
    def parse(self, uri, content):
        '''
            timed and logged front end of the actual @_parse method
        :param uri: object, uri for the @content
        :param content: string, content for the @uri
        :return: list, list with @Uri objects, or None when @uri is not
                 accepted
        '''
        if self.accept(uri):
            elapsed, found = Helper.timerun(self._parse, uri, content)
            logger.info(
                "parser: parse links: %s, parsed. links: %d, time used: %fs",
                uri.url(), len(found), elapsed)
            return found

        # uri rejected by accept(): nothing is parsed
        return None
Пример #10
0
    def pull(self):
        '''
            pull next link from linker through @_pull, timing and logging it
        :return: object, the uri() of the pulled link, or None when @_pull
                 gives no link
        '''
        elapsed, link = Helper.timerun(self._pull)

        # guard: nothing left to pull
        if link is None:
            logger.info(
                "linker: pull link none, no more links. time used: %fs",
                elapsed)
            return None

        logger.info("linker: pull link %s, pulled. time used: %fs",
                    link.uri().url(), elapsed)
        return link.uri()
Пример #11
0
    def extract(self, uri, content):
        '''
            extract data from content through @_extract, timing and logging it
        :param uri: object, @Uri object of content
        :param content: string, content of @uri
        :return: object, extract result object, or None when @uri is not
                 accepted
        '''
        if self.accept(uri):
            elapsed, extracted = Helper.timerun(self._extract, uri, content)
            logger.info(
                "extractor: extract data from: %s, extracted. time used: %fs",
                uri.url(), elapsed)
            return extracted

        # uri rejected by accept(): nothing is extracted
        return None
Пример #12
0
    def push(self, uri):
        '''
            push a uri to linker through @_push, unless it already exists
            or is not accepted; each of the three outcomes is logged
        :param uri: object, Uri object
        :return: None on every path
                 NOTE(review): the earlier doc promised the key of the
                 stored link, but the result of @_push is never returned -
                 confirm which one is intended
        '''
        if self.exists(uri):
            logger.info("linker: push link %s, exists.", uri.url())
        elif not self.accept(uri):
            logger.info("linker: push link %s, filtered.", uri.url())
        else:
            elapsed, _ = Helper.timerun(self._push, uri)
            logger.info("linker: push link %s, pushed. time used:%fs",
                        uri.url(), elapsed)
Пример #13
0
 def _persist(self):
     '''
         dump the patterns as json into the file opened through
         Helper.open with the work directory and the name of this object
     :return:
     '''
     fpatterns = Helper.open(self.workdir(), self.name(), "w")
     try:
         json.dump(self.__patterns, fpatterns)
     finally:
         # close even when the dump fails, so the handle is not leaked
         fpatterns.close()
Пример #14
0
    def _launch(self):
        '''
            load the json patterns from the file opened through Helper.open
            with the work directory and the name of this object, then pass
            them to @_filter as positional arguments
        :return:
        '''
        fpatterns = Helper.open(self.workdir(), self.name(), "r")
        try:
            patterns = json.load(fpatterns)
        finally:
            # close even when the file is not valid json, so the handle is
            # not leaked
            fpatterns.close()

        self._filter(*patterns)