Example #1
def request_handler(self, connection, address):
    data = connection.recv(BUFFER_SIZE)
    self.logger.info("\n%s", data)
    request = Request.create_request(data)
    response = str(RequestHandler(request, self.document_root).response)
    # sendall expects bytes in Python 3, so encode the response string first
    connection.sendall(response.encode())
    connection.close()
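
A hedged sketch of how a handler like this might be wired up, assuming a module-level BUFFER_SIZE and a threaded accept loop (the serve() helper and its defaults are illustrative, not part of the original project):

import socket
import threading

BUFFER_SIZE = 4096  # assumed; the handler above reads this many bytes per recv

def serve(handler, host='127.0.0.1', port=8080):
    # Accept TCP connections and hand each one to the request handler
    # in its own thread; handler would be a bound method like the one above.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.bind((host, port))
        sock.listen()
        while True:
            connection, address = sock.accept()
            threading.Thread(target=handler, args=(connection, address), daemon=True).start()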
Example #2
def __init__(self):
    # initialize the spider list
    self.spiders = self.__init_spiders()
    # initialize the output manager
    self.op_manager = OutPutManager(self.spiders)
    # initialize the request handler
    self.request_handler = RequestHandler(self.spiders)
    # initialize the job list
    self.jobs = []
Example #3
    def __init__(self, dropbox, logall, mail, maxsize):
        self.dropbox = dropbox
        self.logall = logall
        self.sendmail = mail
        self.maxsize = maxsize

        if self.dropbox:
            assert secrets.PATH_IN_DB != ''
            self.save_path = secrets.PATH_IN_DB
            self.file_handler = DropboxSaver(self.save_path, secrets.DROPBOX_TOKEN)
        else:
            assert secrets.PATH != ''
            self.save_path = secrets.PATH
            self.file_handler = FileSaver(self.save_path)

        self.req = RequestHandler(secrets.USER, secrets.PASSWORD)
        self.file_handler.create_folder(CHLOG_FOLDER)
        self.database = Database(self.file_handler, self.dropbox)

        self.courses = secrets.COURSES
        self.removed_label_flag = False
        self.downloads = []
        self.changelog = []
Example #4
class Crawler:
    """A crawler for downloading university e-learning content."""
    def __init__(self, dropbox, logall, mail, maxsize):
        self.dropbox = dropbox
        self.logall = logall
        self.sendmail = mail
        self.maxsize = maxsize

        if self.dropbox:
            assert secrets.PATH_IN_DB != ''
            self.save_path = secrets.PATH_IN_DB
            self.file_handler = DropboxSaver(self.save_path, secrets.DROPBOX_TOKEN)
        else:
            assert secrets.PATH != ''
            self.save_path = secrets.PATH
            self.file_handler = FileSaver(self.save_path)

        self.req = RequestHandler(secrets.USER, secrets.PASSWORD)
        self.file_handler.create_folder(CHLOG_FOLDER)
        self.database = Database(self.file_handler, self.dropbox)

        self.courses = secrets.COURSES
        self.removed_label_flag = False
        self.downloads = []
        self.changelog = []

    def __str__(self):
        if not self.downloads:
            return 'Files were already up to date.'
        else:
            s = 's' if len(self.downloads) != 1 else ''
            d = 'DROPBOX/' if self.dropbox else ''
            p = d + self.save_path
            return '{} new file{} downloaded from ILIAS to {}.'.format(len(self.downloads), s, p)

    def run(self):
        """Main entry point.

        Authenticate the client, crawl the courses, persist the results,
        write a changelog and optionally send a mail with the results.
        """
        # authentication
        try:
            response = self.req.login()
            html_text = response.text
        except ConnectionError as err:
            print(err, 'A ConnectionError occurred. Please check your internet connection.', sep='\n')
            sys.exit(1)

        # check whether authentication worked; has to be done this way
        # since HTTP response on failed authentication is 200 - OK.
        auth_failed_msg = 'Anmeldedaten wurden nicht akzeptiert'
        if auth_failed_msg in html_text:
            print('Authorization failed. Please check your username and password.')
            sys.exit(1)

        # crawl courses
        self.crawl(html_text)

        # wrap up: close database, write changelog and send mail
        self.database.close(self.file_handler, self.dropbox)
        self.write_changelog()
        if self.sendmail and self.downloads:
            self.req.send_mail(self, self.downloads)

        # print download stats
        clrone = clr.BOLD
        clrtwo = clr.GREEN if self.downloads else clr.ENDC
        clrend = clr.ENDC
        print(clrone, clrtwo, self, clrend, sep='')

    def crawl(self, html_text):
        """Loop through top level courses and crawl the content for every course."""
        soup_courses = BeautifulSoup(html_text, 'html.parser')

        for soup_course in soup_courses.find_all('a', {'class': 'il_ContainerItemTitle'}):
            scs = soup_course.string
            course_name = util.course_contains(scs, self.courses)
            relative_link = soup_course.get('href')
            course_url = 'https://ilias.uni-mannheim.de/' + relative_link

            if course_name is not None:
                self.crawl_course(course_url, course_name + '/')
            else:
                print(clr.BOLD, 'No download requested for course >> ', clr.ENDC, scs.lstrip(), sep='')

    def crawl_course(self, course_url, folder_path):
        """Recursively call this method until there is something to download
           for this course in the respective path."""
        html_text_course = self.req.session.get(course_url).text
        soup_course = BeautifulSoup(html_text_course, 'html.parser')
        containers = soup_course.find_all('div', {'class': 'il_ContainerListItem'})

        if containers:
            if not self.removed_label_flag:
                util.print_method('folder_path', folder_path)

            for container in containers:
                file_ending = ''
                last_update = ''  # default in case the item lists no properties
                soup_line = container.find('a', {'class': 'il_ContainerItemTitle'})
                if soup_line:
                    link = soup_line.get('href')
                else:
                    continue
                item_properties = container.find('div', {'class': 'ilListItemSection il_ItemProperties'})
                if item_properties is not None:
                    # note: the attribute filter must be a dict, not a set
                    item_prop = item_properties.find_all('span', {'class': 'il_ItemProperty'})
                    properties = [str(prop.string.strip()) for prop in item_prop if prop.string is not None]
                    if properties:
                        file_ending = properties[0]
                        last_update = properties[2]
                        # 22. May 2019, 14:15 ->2019-05-22 14:15:00
                        d = datetime.strptime(last_update, '%d. %b %Y, %H:%M')
                        # 201905221415
                        last_update = d.strftime('%Y%m%d%H%M')

                if 'download' in link:
                    self.file_handler.create_folder(folder_path)
                    self.check_save(folder_path, soup_line.string, file_ending, last_update, link)
                else:
                    parsed = util.remove_edge_characters(soup_line.string)
                    if not parsed:
                        self.removed_label_flag = True
                    self.crawl_course('https://ilias.uni-mannheim.de/' + link, folder_path + parsed)
        else:
            util.print_method('no_files_in', str(folder_path))

    def check_save(self, folder_path, filename, file_ending, last_update, url):
        """Prepare the file to be saved. Remove edge characters,
           trim and add the correct file ending."""
        # remove edge characters and trim
        filename = re.sub(r'[&]', 'and', filename)
        filename = re.sub(r'[!@#$/\:;*?<>|]', '', filename).strip()

        http = self.req.session.head(url, headers={'Accept-Encoding': 'identity'})
        file_size = http.headers['content-length']
        if not file_ending:
            file_ending = str(mimetypes.guess_extension(http.headers['content-type']))

        relative_file = folder_path + filename
        relative_path = relative_file + '.' + file_ending

        # for printing what is done with that file
        clrone = clr.ENDC
        clrtwo = clr.ENDC
        method = ''
        messag = relative_path

        # query db for path and update
        res_pu = self.database.get_name_update(relative_path, last_update)

        # example file sizes: 2E8 = 200,000,000 bytes (200 MB); 5E7 = 50,000,000 bytes (50 MB)
        if float(file_size) >= self.maxsize:  # Skip
            clrone = clr.BLUE
            method = 'file_skiped'  # kept at 11 characters so printed methods align
        # if db contains entry with path and update, file was already downloaded
        elif res_pu:  # exists
            method = 'loaded_once'
        else:
            # download file to compute hash
            content = self.req.session.get(url).content
            # compute content hash
            content_hash = hashlib.sha1(content).hexdigest()

            # query db for hash
            res_h = self.database.get_hash(content_hash)
            # filename or last update may have changed but hash exists
            # thus file is known and was already downloaded
            if res_h:  # exists
                method = 'loaded_once'
            else:
                # query db for name
                res_p = self.database.get_name(relative_path)
                # if this name already exists in the database
                # must be an update because otherwise the name + last_update
                # or the hash should have been in the db already
                if res_p:
                    method = 'file_update'
                    clrone = clr.GREEN
                    relative_path = '{}_UP{}.{}'.format(relative_file, content_hash[:4], file_ending)
                    messag = relative_path
                # not an update: new file
                else:
                    # check if this filename exists already at the destination path
                    # should not happen unless user renamed file to exactly this downloaded file name
                    # check also only exists to inform user that file is not just overwritten
                    # but safely moved to the .overwritten/ folder
                    exists = self.file_handler.exists(relative_path)
                    if not exists:
                        method = 'downloading'
                        clrone = clr.BOLD
                    else:
                        method = 'safe_overwr'
                        clrone = clr.RED
                    messag = relative_path + ' from ' + url
                saved = self.file_handler.save_file(relative_path, content)
                if saved:
                    self.database.insert(relative_path, content_hash, last_update)
                    self.downloads.append(method + ': ' + relative_path)

        # skipped and already-loaded files are only logged when logall is set
        if method not in ('file_skiped', 'loaded_once') or self.logall:
            self.changelog.append(method + ': ' + messag)

        util.print_method(method, messag, clrone, clrtwo)

    def write_changelog(self):
        """Write a changelog to /chosen_dir/.changelog/changelog_{datetime}."""
        if not self.changelog:
            return
        d = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
        tmp = '# Changelog from {}\n'.format(d)
        tmp += str(len(tmp) * '-') + '\n'
        tmp += '\n'.join(self.changelog) + '\n'
        b = tmp.encode('utf-8')
        self.file_handler.save_file(CHLOG_FOLDER + 'changelog_{}.txt'.format(d), b, True)
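
A hedged usage sketch for this crawler; the flag values below are illustrative, inferred from the __init__ signature, and assume the secrets module is configured:

if __name__ == '__main__':
    crawler = Crawler(dropbox=False,  # save locally via FileSaver rather than Dropbox
                      logall=False,   # log only downloads and updates, not skips
                      mail=True,      # send the results mail when new files were downloaded
                      maxsize=5e7)    # skip files of 50 MB or more
    crawler.run()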
Example #5
class Core(object):

    def __init__(self):
        # initialize the spider list
        self.spiders = self.__init_spiders()
        # initialize the output manager
        self.op_manager = OutPutManager(self.spiders)
        # initialize the request handler
        self.request_handler = RequestHandler(self.spiders)
        # initialize the job list
        self.jobs = []

    def __call__(self):
        """Spawn the worker greenlets and wait for them to finish."""
        if getattr(setting, 'INIT_REQUESTS', getattr(defaultsettings, 'INIT_REQUESTS')):
            self.__init_requests()
        concurrency = getattr(setting, 'CONCURRENCY', getattr(defaultsettings, 'CONCURRENCY'))
        self.jobs = [gevent.spawn(self.__roll_request) for _ in range(concurrency)]
        gevent.joinall(self.jobs)

    def __init_requests(self):
        for spider in self.spiders:
            for request in spider.start_requests():
                RequestQueue.push(request, spider)

    @staticmethod
    def __init_spiders():
        """实例化setting中指定的spiders"""
        def is_module(x):
            if x.endswith('.py') and not x.startswith('__'):
                return True
        spiders = []
        for module_file in filter(is_module, os.listdir('spiders')):
            module = importlib.import_module('spiders.' + '.'.join(module_file.split('.')[:-1]))
            for attr_name in dir(module):
                attr = getattr(module, attr_name)
                if isinstance(attr, type) and issubclass(attr, Spider) and attr.__name__ in tools.get_conf('spiders'):
                    logger.debug('Creating instance of spider <%s>', attr.__name__)
                    spiders.append(attr())
        return spiders

    def __save_mode(self, request, err):
        """未知异常处理,安全模式下保护进程,调试模式下退出进程"""
        logger.error('Unexpected error happen when crawling<%s>, reason: %s: %s', request.url, type(err), err)
        if not tools.get_conf('SAFE_MODE'):
            gevent.killall(self.jobs)

    def __throw_request(self, request, spider):
        """将请求抛给请求处理器(ReuqestHandler)来处理, 当获取响应失败时抛出NoResponseError, 遇到未知类型错误将根据SAVE_MODE来决定是否退出进程"""
        try:
            return self.request_handler.handle_request(request, spider)
        except NoResponseError:
            raise
        except Exception as err:
            self.__save_mode(request, err)
            time.sleep(60)

    def __parse_response(self, response, spider):
        """处理响应,将新的请求放入请求队列,提取到的item抛给输入管理器处理"""
        try:
            callback = getattr(spider, response.request.callback)
            for each in callback(response):
                if isinstance(each, req.Request):  # push newly produced requests onto the queue
                    RequestQueue.push(each, spider)
                elif isinstance(each, dict):  # hand extracted items to op_manager for processing
                    self.op_manager(each, spider)
        except Exception as err:
            self.__save_mode(response.request, err)

    def __roll_request(self):
        """不断从请求列表中提取请求进行处理(发送请求,解析响应)"""
        while 1:
            try:
                request = RequestQueue.pop()
            except EmptyError:
                # the request queue is empty, shut down all workers
                gevent.killall(self.jobs)
            try:
                # dispatch the request
                response = self.__throw_request(request, request.spider)
            except NoResponseError as err:
                logger.debug('No response, reason: %s', err)
                if 'duplicate url' in err.message:
                    RequestQueue.del_requesting(request)
            else:
                # parse the response
                self.__parse_response(response, request.spider)
                RequestQueue.del_requesting(request)

    def start_core(self):
        """Start the core driver."""
        # increment the redis client connection count
        redis_conn.setnx('connect_clients', 0)
        redis_conn.incr('connect_clients')
        self.__init_requests()
        self.jobs = [gevent.spawn(self.__roll_request) for _ in range(getattr(setting, 'CONCURRENCY', 10))]
        gevent.joinall(self.jobs)
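
A minimal start-up sketch, assuming the setting/defaultsettings modules and the spiders package are in place; that instantiating Core and calling it is the intended entry point is an inference from __call__ above:

if __name__ == '__main__':
    core = Core()  # builds the spider list, output manager and request handler
    core()         # spawns CONCURRENCY worker greenlets and joins them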
Example #6
def __init__(self, config):
    """Takes a configReader.Config object as an argument."""
    self.request = RequestHandler(config)
    self.config = config
Example #7
class API:

    def __init__(self, config):
        """Takes a configReader.Config object as an argument."""
        self.request = RequestHandler(config)
        self.config = config

    def getItemsById(self, ids=None):
        """Returns an item collection. Takes a list of ids as an argument."""
        ids = [str(x) for x in ids or []]
        resp = self.request.get({"action":"wbgetentities", "ids": "|".join(ids)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemById(self, iid):
        """Returns the item defined by the argument."""
        return self.getItemsById([iid])[0]

    def getItemsByInterwiki(self, arg1=None, arg2=None):
        """Returns an item collection. Takes either a list of sites as the first
        argument and a list of titles as the second, or a list of (site, title)
        pairs as the only argument."""
        if arg1 and not arg2:  # then arg1 is [(site, title), (site, title), ...]
            sites = [x[0] for x in arg1]
            titles = [x[1] for x in arg1]
        else:
            sites = arg1 or []
            titles = arg2 or []
        resp = self.request.get({"action":"wbgetentities", "sites": "|".join(sites), "titles": "|".join(titles)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemByInterwiki(self, site, title):
        """Returns an item which has the requested site and title."""
        return self.getItemsByInterwiki([site], [title])[0]

    def save(self, items, comment=None):
        """Saves a list of items or a single item, with an optional second parameter being the summary."""
        if not isinstance(items, list):
            items = [items]
        for item in items:
            params = {"action":"wbeditentity"}
            if item.id:
                params["id"] = "q" + str(item.id)
            if comment:
                params["summary"] = comment
            if self.config["botflag"]:
                params["bot"] = "1"
            data = { "sitelinks": item.sitelinks.export(),
                     "aliases": item.aliases.export(),
                     "labels": item.labels.export(),
                     "descriptions": item.descriptions.export()
                   }

            params["data"] = json.dumps(data, ensure_ascii=False)
            self.request.postWithToken(params)

    def _createItemCollection(self, data):
        items = []
        for item in data:
            i = self._createItem(data[item])
            items.append(i)
        return items

    def _createItem(self, item, target=None):
        if not "sitelinks" in item:
            item["sitelinks"] = {}
        if not "aliases" in item:
            item["aliases"] = {}
        if not "labels" in item:
            item["labels"] = {}
        if not "descriptions" in item:
            item["descriptions"] = {}

        sitelinks = {}
        for x in item["sitelinks"]:
            sitelinks[x] = item["sitelinks"][x]["title"]
        aliases = {}
        for x in item["aliases"]:
            aliases[x] = [y["value"] for y in item["aliases"][x]]
        labels = {}
        for x in item["labels"]:
            labels[x] = item["labels"][x]["value"]
        descriptions = {}
        for x in item["descriptions"]:
            descriptions[x] = item["descriptions"][x]["value"]
        if target:
            target.sitelinks = sitelinks
            target.aliases = aliases
            target.labels = labels
            target.descriptions = descriptions
            if target.id and target.id != item["id"]:
                raise errors.ItemIDMismatch("Local item id does not match the remote id. Did you set the id manually?")
            else:
                target.id = item["id"]
            return target
        else:
            i = Item(sitelinks, aliases, labels, descriptions)
            i.id = int(item["id"].lower().replace("q",""))
            return i
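
A hedged usage sketch for this wrapper. The config loading call is hypothetical, and it assumes Item exposes dict-like label containers whose export() the save() method can call:

config = configReader.Config('config.ini')  # hypothetical loading call
api = API(config)
item = api.getItemByInterwiki('enwiki', 'Douglas Adams')
item.labels['en'] = 'Douglas Adams'
api.save(item, comment='Set English label')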
Example #8
class API:

    def __init__(self, config):
        """Takes a configReader.Config object as an argument."""
        self.request = RequestHandler(config)
        self.config = config

    def getItemsById(self, ids=None):
        """Returns an item collection. Takes a list of ids as an argument."""
        ids = [str(x) for x in ids or []]
        resp = self.request.get({"action":"wbgetentities", "ids": "|".join(ids)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemById(self, iid):
        """Returns the item defined by the argument."""
        return self.getItemsById([iid])[0]

    def getItemsByInterwiki(self, arg1=None, arg2=None):
        """Returns an item collection. Takes either a list of sites as the first
        argument and a list of titles as the second, or a list of (site, title)
        pairs as the only argument."""
        if arg1 and not arg2:  # then arg1 is [(site, title), (site, title), ...]
            sites = [x[0] for x in arg1]
            titles = [x[1] for x in arg1]
        else:
            sites = arg1 or []
            titles = arg2 or []
        resp = self.request.get({"action":"wbgetentities", "sites": "|".join(sites), "titles": "|".join(titles)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemByInterwiki(self, site, title):
        """Returns an item which has the requested site and title."""
        return self.getItemsByInterwiki([site], [title])[0]

    def save(self, items, comment=None):
        """Saves a list of items or a single item, with an optional second parameter being the summary."""
        if not isinstance(items, list):
            items = [items]
        for item in items:
            params = {"action":"wbsetitem"}
            if item.id:
                params["id"] = item.id
            if comment:
                params["summary"] = comment
            if self.config["botflag"]:
                params["bot"] = "1"
            data = { "sitelinks": item.sitelinks.export(),
                     "aliases": item.aliases.export(),
                     "labels": item.labels.export(),
                     "descriptions": item.descriptions.export()
                   }

            params["data"] = json.dumps(data, ensure_ascii=False)
            self.request.postWithToken(params)

    def _createItemCollection(self, data):
        items = []
        for item in data:
            i = self._createItem(data[item])
            items.append(i)
        return items

    def _createItem(self, item, target=None):
        if not "sitelinks" in item:
            item["sitelinks"] = {}
        if not "aliases" in item:
            item["aliases"] = {}
        if not "labels" in item:
            item["labels"] = {}
        if not "descriptions" in item:
            item["descriptions"] = {}

        sitelinks = {}
        for x in item["sitelinks"]:
            sitelinks[x] = item["sitelinks"][x]["title"]
        aliases = {}
        for x in item["aliases"]:
            aliases[x] = [y["value"] for y in item["aliases"][x]]
        labels = {}
        for x in item["labels"]:
            labels[x] = item["labels"][x]["value"]
        descriptions = {}
        for x in item["descriptions"]:
            descriptions[x] = item["descriptions"][x]["value"]
        if target:
            target.sitelinks = sitelinks
            target.aliases = aliases
            target.labels = labels
            target.descriptions = descriptions
            if target.id and target.id != item["id"]:
                raise errors.ItemIDMismatch("Local item id does not match the remote id. Did you set the id manually?")
            else:
                target.id = item["id"]
            return target
        else:
            i = Item(sitelinks, aliases, labels, descriptions)
            i.id = int(item["id"].replace("q",""))
            return i