def start(self):
    from dirhunt.processors import get_processor, GenericProcessor, Error
    session = self.crawler.sessions.get_session()
    try:
        resp = session.get(self.url.url, stream=True, timeout=TIMEOUT, allow_redirects=False)
    except RequestException as e:
        self.crawler.results.put(Error(self, e))
        self.close()
        return self

    self.set_type(resp.headers.get('Content-Type'))
    self.flags.add(str(resp.status_code))
    text = ''
    soup = None

    if resp.status_code < 300 and self.maybe_directory():
        # Size-capped streamed read: never buffer more than MAX_RESPONSE_SIZE bytes.
        text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
        soup = BeautifulSoup(text, 'html.parser')
    if self.maybe_directory():
        processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
        processor.process(text, soup)
        self.crawler.results.put(processor)
        self.flags.update(processor.flags)
    # TODO: We could check processor.index_file. If it exists and returns a 200, then it exists.
    if self.exists is None and resp.status_code < 404:
        self.exists = True
    self.add_self_directories(True if (not self.maybe_rewrite() and self.exists) else None,
                              'directory' if not self.maybe_rewrite() else None)
    self.close()
    return self
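The core technique above is the size-capped streamed read: stream=True makes session.get() return as soon as the headers arrive, and resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True) then pulls at most that many decoded bytes from the socket, so a huge or endless body cannot exhaust memory. A minimal standalone sketch of the same pattern, assuming an illustrative 512 KiB cap and a hypothetical fetch_capped() helper (neither is dirhunt's actual name or value):

import requests
from bs4 import BeautifulSoup

MAX_RESPONSE_SIZE = 512 * 1024  # illustrative cap, not necessarily dirhunt's value

def fetch_capped(url, timeout=10):
    """Fetch a page while buffering at most MAX_RESPONSE_SIZE bytes."""
    # stream=True: the headers are read now; the body stays on the socket.
    resp = requests.get(url, stream=True, timeout=timeout, allow_redirects=False)
    # decode_content=True applies Content-Encoding (gzip/deflate) transparently,
    # matching the resp.raw.read() call in the method above.
    text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
    return resp, BeautifulSoup(text, 'html.parser')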
def start(self):
    from dirhunt.processors import get_processor, GenericProcessor, Error, ProcessIndexOfRequest
    session = self.crawler.sessions.get_session()
    try:
        # verify=False disables TLS certificate verification for this request.
        resp = session.get(self.url.url, stream=True, verify=False, timeout=self.timeout,
                           allow_redirects=False)
    except RequestException as e:
        self.crawler.current_processed_count += 1
        self.crawler.results.put(Error(self, e))
        self.close()
        return self

    self.set_type(resp.headers.get('Content-Type'))
    self.flags.add(str(resp.status_code))
    text = ''
    soup = None
    processor = None

    if resp.status_code < 300 and self.must_be_downloaded(resp):
        try:
            text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
        except (RequestException, ReadTimeoutError, socket.timeout) as e:
            self.crawler.current_processed_count += 1
            self.crawler.results.put(Error(self, e))
            self.close()
            return self
        # Only build a soup for responses declared as HTML.
        soup = BeautifulSoup(text, 'html.parser') \
            if resp.headers.get('Content-Type') == 'text/html' else None
    if self.must_be_downloaded(resp):
        processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
        processor.process(text, soup)
        self.flags.update(processor.flags)
    if self.maybe_directory():
        self.crawler.results.put(processor)
    if processor is not None:
        self.processor_data = processor.json()
    if processor and isinstance(processor, ProcessIndexOfRequest):
        self.crawler.index_of_processors.append(processor)
    else:
        self.crawler.current_processed_count += 1
    # TODO: We could check processor.index_file. If it exists and returns a 200, then it exists.
    if self.exists is None and resp.status_code < 404:
        self.exists = True
    self.add_self_directories(
        True if (not self.maybe_rewrite() and self.exists) else None,
        'directory' if not self.maybe_rewrite() else None)
    self.close()
    return self
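The second version additionally guards the body read itself: once you bypass requests and read resp.raw directly, urllib3's ReadTimeoutError and a bare socket.timeout can escape without being wrapped in a RequestException, so all three must be caught. A self-contained sketch of that hardening, with the Content-Type gate from the code above (fetch_body and its defaults are hypothetical names, not dirhunt API):

import socket

import requests
from bs4 import BeautifulSoup
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

MAX_RESPONSE_SIZE = 512 * 1024  # illustrative cap

def fetch_body(url, timeout=10):
    """Return (resp, soup_or_None); None also signals a failed body read."""
    resp = requests.get(url, stream=True, timeout=timeout, allow_redirects=False)
    try:
        # Reading resp.raw bypasses requests' exception translation, so
        # urllib3 and socket-level timeouts must be caught explicitly.
        text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
    except (RequestException, ReadTimeoutError, socket.timeout):
        return resp, None
    # Parse only bodies declared as HTML (exact match, as in the method above).
    if resp.headers.get('Content-Type') == 'text/html':
        return resp, BeautifulSoup(text, 'html.parser')
    return resp, None

Note that the exact string comparison skips Content-Type values like 'text/html; charset=utf-8'; a looser startswith() check would also match those.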