class TorProxyMiddleware(object):
    '''This middleware enables Tor to serve as connection proxies'''

    def __init__(self, crawler: Crawler, max_count: int, allow_reuse_ip_after: int):
        '''Creates a new instance of TorProxyMiddleware

        Keyword arguments:
        max_count -- Maximum IP usage
        allow_reuse_ip_after -- When an IP can be reused
        '''
        self.crawler = crawler
        self.max_count = max_count
        # Requests served since the last IP rotation.
        self.items_scraped = 0
        # Acquire an initial Tor IP right away so the very first requests
        # already go out through a fresh exit node.
        self.tor_ip_changer = TorIpChanger(
            reuse_threshold=allow_reuse_ip_after)
        self.tor_ip_changer.get_new_ip()

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        '''Build the middleware from crawler settings; disabled unless
        TOR_IPROTATOR_ENABLED is set.'''
        settings = crawler.settings
        if not settings.getbool('TOR_IPROTATOR_ENABLED', False):
            raise NotConfigured()
        return cls(
            crawler=crawler,
            max_count=settings.getint('TOR_IPROTATOR_CHANGE_AFTER', 1000),
            allow_reuse_ip_after=settings.getint(
                'TOR_IPROTATOR_ALLOW_REUSE_IP_AFTER', 10),
        )

    def _rotate_ip(self, spider: Spider) -> None:
        '''Pause the crawl engine, ask Tor for a fresh exit IP, resume.'''
        spider.log('Changing Tor IP...')
        self.items_scraped = 0
        self.crawler.engine.pause()
        new_ip = self.tor_ip_changer.get_new_ip()
        self.crawler.engine.unpause()
        if not new_ip:
            raise Exception('FatalError: Failed to find a new IP')
        spider.log(f'New Tor IP: {new_ip}')

    def process_request(self, request: Request, spider: Spider) -> None:
        '''Route every request through the local proxy, rotating the Tor
        IP once the usage quota is reached.'''
        if self.items_scraped >= self.max_count:
            self._rotate_ip(spider)
        # http://127.0.0.1:8118 is the default address for Privoxy
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        self.items_scraped += 1
def __init__(self, crawler: Crawler, max_count: int, allow_reuse_ip_after: int):
    '''Creates a new instance of TorProxyMiddleware

    Keyword arguments:
    crawler -- the Scrapy crawler this middleware is attached to
    max_count -- Maximum IP usage (requests served before the IP rotates)
    allow_reuse_ip_after -- When an IP can be reused (reuse threshold
        handed to TorIpChanger)
    '''
    self.crawler = crawler
    self.max_count = max_count
    # TorIpChanger enforces the IP-reuse policy.
    self.tor_ip_changer = TorIpChanger(
        reuse_threshold=allow_reuse_ip_after)
    # Acquire an initial Tor IP immediately (network side effect).
    self.tor_ip_changer.get_new_ip()
    # Counter of requests served since the last IP change.
    self.items_scraped = 0
def __setup_ip_rotation(self, antiblock_config: Dict[str, Any] = {}):
    """ Setup the configurations for the ip rotation

    Two rotation mechanisms are supported, selected by the mandatory
    'iprotator_type' key: 'tor' or 'proxy'.
    """
    rotation_type = antiblock_config["iprotator_type"]
    self.ip_rotation_type = rotation_type

    if rotation_type == 'tor':
        # Requests served by one IP before rotating, and how many
        # distinct IPs must be used before one may repeat.
        self.ip_change_after = antiblock_config.get(
            'tor_iprotator_change_after', 1)
        self.ip_reuse_after = antiblock_config.get(
            'tor_iprotator_allow_reuse_ip_after', 10)
        self.__validate_ip_tor_config()
        # Acquire an initial Tor IP up front (network side effect).
        self.tor_controller = TorIpChanger(
            reuse_threshold=self.ip_reuse_after)
        self.tor_controller.get_new_ip()
        return

    if rotation_type == 'proxy':
        self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
        self.__validate_ip_proxy_config()
        return

    raise ValueError('Invalid ip rotation type: ' + rotation_type)
from flask import jsonify
from toripchanger import TorIpChanger
from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import app_factory

app = app_factory(__name__)

# Global IP store (using only specific `TorIpChanger` functionality).
IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    """Report whether `ip` may be used, recording it when it is.

    Returns JSON of the form {"safe": <bool>}.

    NOTE(review): this relies on TorIpChanger's private helpers
    (`_ip_is_safe`, `_manage_used_ips`), which may change between
    toripchanger releases -- verify on upgrade.
    """
    safe = IPSTORE._ip_is_safe(ip)
    if safe:
        # Presumably records the IP as used so subsequent checks honor
        # the reuse threshold -- confirm against toripchanger docs.
        IPSTORE._manage_used_ips(ip)
    return jsonify({"safe": safe})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=Config.IPSTORE_PORT)
month=random.randint(1, 12), day=random.randint(1, 28)) delta = datetime.timedelta(days=random.randint(8, 260)) beign_date = end_date - delta timeframe = f"{beign_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}" return timeframe while True: loc = random.choice(geo_list) search = random.choice(kw) try: current_ip = tor_ip_changer.get_new_ip() except: pass pytrends = TrendReq() print(loc) print(search) pytrends.build_payload(search, cat=0, timeframe=random_timeframe(), geo=loc) df = pytrends.interest_over_time() if __name__ == "__main__": tor_ip_changer = TorIpChanger(tor_password='******', tor_port=9051, local_http_proxy='127.0.0.1:8118') random_query()
# author = 'BlackSesion' import base64 import json import random import re import traceback import urllib import urllib2 import sys from scrapy.exceptions import IgnoreRequest from scrapy.conf import settings from toripchanger import TorIpChanger # A Tor IP will be reused only after 10 different IPs were used. ip_changer = TorIpChanger(reuse_threshold=10) class RandomUserAgentMiddleware(object): def process_request(self, request, spider): ua = random.choice(settings.get('USER_AGENT_LIST')) if ua: request.headers.setdefault('User-Agent', ua) class ProxyMiddleware(object): # overwrite process request _requests_count = 0 _requests_count_x_ip = 10 def process_request(self, request, spider):
# Module-level wiring for the examplescraper2 pipeline: configure the
# Tor IP changer, user agents and logging, then assemble the pipeline.
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.scrapers.examplescraper2.custom_pipeline import (
    ExhaustApiLimitPipeLine,
)  # noqa
from scrapemeagain.scrapers.examplescraper2.scraper import ExampleScraper2
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

# Configure TorIpChanger.
# NOTE(review): TorIpChanger is not imported in this snippet; it is
# presumably imported elsewhere in the module -- verify.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper2")

# Prepare the scraping pipeline.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
def set_new_ip(self):
    """Request a brand-new Tor exit IP and return it.

    A throwaway TorIpChanger is constructed on every call, configured
    with reuse_threshold=0 and the HTTP_PROXY address from the
    project settings.
    """
    changer = TorIpChanger(
        reuse_threshold=0,
        tor_password='******',
        tor_port=9051,
        local_http_proxy=self.settings.get('HTTP_PROXY'),
    )
    return changer.get_new_ip()
from stem.util.log import get_logger # logger = get_logger() # logger.propagate = False # Default settings. REUSE_THRESHOLD = 1 LOCAL_HTTP_PROXY = "127.0.0.1:8118" NEW_IP_MAX_ATTEMPTS = 10 TOR_PASSWORD = "******" TOR_ADDRESS = "127.0.0.1" TOR_PORT = 9051 POST_NEW_IP_SLEEP = 0.5 ip_changer = TorIpChanger(reuse_threshold=REUSE_THRESHOLD, tor_password=TOR_PASSWORD, tor_port=TOR_PORT, local_http_proxy=LOCAL_HTTP_PROXY) # Send "Change IP" signal to tor control port class TorMiddleware(object): def __init__(self): self.settings = get_project_settings() self._requests_count = 0 self.controller = Controller.from_port(address=TOR_ADDRESS, port=TOR_PORT) self.controller.authenticate(password=TOR_PASSWORD) def set_new_ip(self): return TorIpChanger( reuse_threshold=0,
# Module-level wiring for the examplescraper pipeline: configure the
# Tor IP changer, user agents and logging, then assemble the pipeline.
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.pipeline import Pipeline
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.scraper import ExampleScraper

# Configure TorIpChanger.
# NOTE(review): TorIpChanger is not imported in this snippet; it is
# presumably imported elsewhere in the module -- verify.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents(__file__)

# Configure logging.
setup_logging(logger_name="example-scraper")

# Prepare the scraping pipeline.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
#https://stackoverflow.com/questions/43942689/error-while-receiving-a-control-message-socketclosed-empty-socket-content-i
from stem.util.log import get_logger
logger = get_logger()
logger.propagate = False

# https://stackoverflow.com/questions/45009940/scrapy-with-privoxy-and-tor-how-to-renew-ip/45010141
from toripchanger import TorIpChanger
from stem import Signal
from stem.control import Controller

# password handling
import os
from dotenv import load_dotenv

# BUG FIX: load_dotenv() was imported but never called, so a TOR_PASS
# defined only in a .env file was never loaded into the environment and
# TOR_PASSWORD silently ended up as None.
load_dotenv()

# TOR
TOR_PASSWORD = os.getenv('TOR_PASS')

# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(tor_password=TOR_PASSWORD, reuse_threshold=10)


class ProxyMiddleware(object):
    """
    learning about to learn about TOR
    https://github.com/WiliTest/Anonymous-scrapping-Scrapy-Tor-Privoxy-UserAgent
    # setting TOR for the first time on Linux
    https://jarroba.com/anonymous-scraping-by-tor-network/
    config the to for the first time
    https://2019.www.torproject.org/docs/faq#torrc
    about TorIpChanger
    https://gist.github.com/DusanMadar/8d11026b7ce0bce6a67f7dd87b999f6b
    """
    # Requests served since the last IP rotation.
    _requests_count = 0
# Python powered way to get a unique Tor IP # Docs: https://pypi.org/project/toripchanger/ # pip install toripchanger # Basic example from toripchanger import TorIpChanger # Tor IP reuse is prohibited. tor_ip_changer_0 = TorIpChanger(reuse_threshold=0) current_ip = tor_ip_changer_0.get_new_ip() # Current Tor IP address can be reused after one other IP was used (default setting). tor_ip_changer_1 = TorIpChanger(local_http_proxy='127.0.0.1:8888') current_ip = tor_ip_changer_1.get_new_ip() # Current Tor IP address can be reused after 5 other Tor IPs were used. tor_ip_changer_5 = TorIpChanger(tor_address="localhost", reuse_threshold=5) current_ip = tor_ip_changer_5.get_new_ip()
class AntiblockDriver():
    """
    General implementation for anti-blocking procedures.
    The _send_request method should be used by subclasses to send a request
    with anti-blocking mechanisms in place. The other methods can be used for
    cases that require more flexibility.

    Mechanisms (all opt-in via the config dict given to the constructor):
    user-agent rotation, fixed/randomized download delay, autothrottle,
    IP rotation (Tor or proxy list) and cookie injection.

    FIXES vs previous revision: mutable default arguments (``= {}``)
    replaced with ``None`` sentinels, and ``_generate_headers`` no longer
    mutates the caller-supplied dict (it previously also polluted the
    shared default dict across calls).
    """

    def __init__(self, antiblock_config: Optional[Dict[str, Any]] = None):
        """ Constructor for the generic antiblock driver.

        :param antiblock_config: Dictionary of configuration parameters for
                                 the antiblock measures
        """
        antiblock_config = {} if antiblock_config is None else antiblock_config

        self.ua_items_scraped = 0
        self.ip_items_scraped = 0

        # --- User-agent rotation ---
        self.ua_rotate = antiblock_config\
            .get('rotate_user_agent_enabled', False)
        if self.ua_rotate:
            self.user_agent_list = antiblock_config.get('user_agents', [])
            self.ua_rotate_min_usage = antiblock_config\
                .get('min_user_agent_usage', 1)
            self.ua_rotate_max_usage = antiblock_config\
                .get('max_user_agent_usage', self.ua_rotate_min_usage)
            # Requests the current UA will serve, drawn from [min, max].
            self.ua_rotate_limit_usage = random\
                .randint(self.ua_rotate_min_usage, self.ua_rotate_max_usage)
            self.__validate_user_agent_config()
            self.user_agents = cycle(self.user_agent_list)
            self.user_agent = next(self.user_agents)

        # --- Delays ---
        self.time_last_request = None
        self.current_delay = None
        self.download_delay = antiblock_config.get('download_delay', 0.25)
        self.randomize_delay = antiblock_config\
            .get('download_delay_randomize', True)
        self.__validate_delay_config()

        # --- Autothrottle ---
        self.at_enabled = antiblock_config.get('autothrottle_enabled', False)
        if self.at_enabled:
            self.at_start_delay = antiblock_config\
                .get('autothrottle_start_delay', 5)
            self.at_max_delay = antiblock_config\
                .get('autothrottle_max_delay', 60)
            self.__validate_autothrottle_config()
            self.current_delay = 0

        # --- IP rotation ---
        self.ip_rotate = antiblock_config.get('iprotator_enabled', False)
        if self.ip_rotate:
            self.__setup_ip_rotation(antiblock_config)

        # --- Cookie injection ---
        self.insert_cookies = antiblock_config.get('insert_cookies', False)
        if self.insert_cookies:
            self.cookies = antiblock_config.get('cookies', [])
            self.__validate_cookie_config()

    def __validate_user_agent_config(self):
        """ Validate the user-agent configurations, raising an error if
        necessary """
        # Validate the list of user-agents
        if not isinstance(self.user_agent_list, list) or \
                len(self.user_agent_list) == 0:
            raise ValueError(
                ('If user-agent rotation is enabled, a '
                 'non-empty list of user-agents must be supplied.'))

        # Validate the minimum UA usage
        if not isinstance(self.ua_rotate_min_usage, int) or \
                self.ua_rotate_min_usage <= 0:
            raise TypeError(('The minimum user-agent usage should be a '
                             'positive integer'))

        # Validate the maximum UA usage
        if not isinstance(self.ua_rotate_max_usage, int) or \
                self.ua_rotate_max_usage <= 0:
            raise TypeError(('The maximum user-agent usage should be a '
                             'positive integer'))

        # Validate the overall range of possible UA usage values
        if self.ua_rotate_min_usage > self.ua_rotate_max_usage:
            raise ValueError('The maximum user-agent usage should be '
                             'greater than the minimum usage.')

    def __validate_delay_config(self):
        """ Validate the delay configurations, raising an error if
        necessary """
        if not isinstance(self.download_delay, (int, float)) or \
                self.download_delay < 0:
            raise ValueError('The download delay should be a positive number.')

    def __validate_autothrottle_config(self):
        """ Validate the autothrottle configurations, raising an error if
        necessary """
        if not isinstance(self.at_start_delay, (int, float)) or \
                self.at_start_delay < 0:
            raise ValueError('The autothrottle start delay should be a '
                             'positive number.')
        if not isinstance(self.at_max_delay, (int, float)) or \
                self.at_max_delay < 0:
            raise ValueError('The autothrottle maximum delay should be a '
                             'positive number.')

    def __validate_ip_tor_config(self):
        """ Validate the ip rotation configurations when using tor, raising
        an error if necessary """
        if not isinstance(self.ip_change_after, int) or \
                self.ip_change_after < 0:
            raise ValueError('The number of times an IP can be used in '
                             'succession should be a positive integer.')
        if not isinstance(self.ip_reuse_after, int) or self.ip_reuse_after < 0:
            raise ValueError('The number of different IPs to be used before '
                             'repeating should be a positive number.')

    def __validate_ip_proxy_config(self):
        """ Validate the ip rotation configurations when using proxies,
        raising an error if necessary """
        if not isinstance(self.proxy_list, list):
            raise ValueError('A valid list of proxies must be supplied.')

    def __validate_cookie_config(self):
        """ Validate the cookie injection configurations, raising an error
        if necessary """
        if not isinstance(self.cookies, list):
            raise ValueError('A valid list of cookies must be supplied.')

    def __setup_ip_rotation(self, antiblock_config: Optional[Dict[str, Any]] = None):
        """ Setup the configurations for the ip rotation """
        antiblock_config = {} if antiblock_config is None else antiblock_config
        rot_type = antiblock_config["iprotator_type"]
        self.ip_rotation_type = rot_type
        if rot_type == 'tor':
            self.ip_change_after = antiblock_config\
                .get('tor_iprotator_change_after', 1)
            self.ip_reuse_after = antiblock_config\
                .get('tor_iprotator_allow_reuse_ip_after', 10)
            self.__validate_ip_tor_config()
            # Acquire an initial Tor IP up front (network side effect).
            self.tor_controller = TorIpChanger(
                reuse_threshold=self.ip_reuse_after)
            self.tor_controller.get_new_ip()
        elif rot_type == 'proxy':
            self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
            self.__validate_ip_proxy_config()
        else:
            raise ValueError('Invalid ip rotation type: ' + rot_type)

    def _generate_next_delay(self, response_latency: float = 0,
                             last_status: int = 0):
        """ Generates the value for the delay to be applied before doing the
        next request.

        :param response_latency: time taken by the last request in seconds
        :param last_status: HTTP status received from the last request
        """
        if self.at_enabled:
            # Autothrottle
            if self.current_delay is None or self.time_last_request is None:
                self.current_delay = self.at_start_delay
            else:
                # Average of the last latency and the current delay.
                next_delay = (response_latency + self.current_delay) / 2

                # Non-200 responses can't decrease the delay
                if last_status == 200 or next_delay > self.current_delay:
                    # Clamp delay between values supplied by the user
                    min_delay = self.download_delay
                    max_delay = self.at_max_delay
                    self.current_delay = max(min_delay,
                                             min(max_delay, next_delay))
        else:
            # Normal delay
            if self.randomize_delay:
                self.current_delay = self.download_delay * \
                    random.uniform(0.5, 1.5)
            else:
                self.current_delay = self.download_delay

    def _get_current_user_agent(self) -> Optional[str]:
        """ Get the current user agent to use, and apply the rotation if
        necessary

        :returns: A string representing the user-agent to use for the next
                  request, or None if user-agent rotation is disabled
        """
        if not self.ua_rotate:
            return None

        if self.ua_items_scraped >= self.ua_rotate_limit_usage:
            # Quota exhausted: draw a new quota and advance to the next
            # user-agent in the cycle.
            self.ua_items_scraped = 0
            self.ua_rotate_limit_usage = random.randint(
                self.ua_rotate_min_usage, self.ua_rotate_max_usage)
            self.user_agent = next(self.user_agents)

        self.ua_items_scraped += 1
        return self.user_agent

    def _apply_delay(self):
        """ Wait for the configured amount of time, previously calculated by
        the _generate_next_delay method.
        """
        if self.time_last_request is None:
            # No request made yet: treat the full delay as already elapsed.
            elapsed = self.current_delay
        else:
            elapsed = time.perf_counter() - self.time_last_request

        if self.time_last_request is None or elapsed < self.current_delay:
            # Wait only for the remaining portion of the delay.
            time.sleep(self.current_delay - elapsed)

    def _generate_headers(self, headers: Optional[Dict[str, Any]] = None):
        """ Generate the headers for the next request, with the correct
        user-agent value.

        :param headers: Dictionary of extra values to be included in the
                        header

        :returns: The headers for the next request, or None if empty
        """
        # Copy instead of mutating: the previous revision wrote the
        # User-Agent into the caller's dict (and the shared default).
        headers = dict(headers) if headers else {}

        user_agent = self._get_current_user_agent()
        if self.ua_rotate and user_agent is not None:
            headers['User-Agent'] = user_agent

        return headers if bool(headers) else None

    def _generate_proxies(self, proxies: Optional[Dict[str, Any]] = None):
        """ Generate the proxies for the next request, considering the given
        list or the Tor configuration, if supplied.

        :param proxies: Dictionary of possible default values for the proxies

        :returns: The proxies to be used by the next request, or None
        """
        proxies = dict(proxies) if proxies else {}

        if self.ip_rotate:
            if self.ip_rotation_type == 'tor':
                if self.ip_items_scraped >= self.ip_change_after:
                    logging.info('Changing Tor IP...')
                    self.ip_items_scraped = 0
                    new_ip = self.tor_controller.get_new_ip()
                    if not new_ip:
                        raise Exception('FatalError: Failed to find a new IP')
                    logging.info(f'New Tor IP: {new_ip}')
                # 127.0.0.1:8118 is the default address for Privoxy.
                proxies = {'http': '127.0.0.1:8118',
                           'https': '127.0.0.1:8118'}
            elif self.ip_rotation_type == 'proxy':
                # Round-robin over the configured proxy list.
                proxy_len = len(self.proxy_list)
                proxy = self.proxy_list[self.ip_items_scraped % proxy_len]
                proxies = {'http': proxy, 'https': proxy}
            self.ip_items_scraped += 1

        return proxies if bool(proxies) else None

    def _generate_cookies(self, cookies: Optional[Dict[str, Any]] = None):
        """ Generate the cookies for the next request.

        :param cookies: Dictionary of extra cookies to be included

        :returns: The cookies to be sent by the next request, or None
        """
        cookies = dict(cookies) if cookies else {}
        if self.insert_cookies:
            # Later entries in the configured list win on key collisions.
            for extra in self.cookies:
                cookies = {**cookies, **extra}
        return cookies if bool(cookies) else None

    def _send_request(self, req_function: Callable, *args, **kwargs) -> Any:
        """ Apply all configured anti-blocking mechanisms and call the request
        function supplied.

        :param req_function: The function to be called to actually send the
                             request. It should take at least three named
                             arguments: headers, proxies and cookies, which
                             represent the respective values to be inserted.
                             Any extra values passed to this method are
                             redirected to the req_function.

        :returns: The response received from the supplied function
        """
        headers = self._generate_headers(kwargs.pop('headers', {}))
        proxies = self._generate_proxies(kwargs.pop('proxies', {}))
        cookies = self._generate_cookies(kwargs.pop('cookies', {}))

        self._apply_delay()
        response = req_function(headers=headers, proxies=proxies,
                                cookies=cookies, *args, **kwargs)

        # Calculate next delay value
        self._generate_next_delay(response.elapsed.total_seconds(),
                                  response.status_code)
        self.time_last_request = time.perf_counter()
        return response
# - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) # A Tor IP will be reused only after 10 different IPs were used. ip_changer = TorIpChanger(tor_password='******', tor_port=9051, reuse_threshold=300) class ProxyMiddleware(object): def process_request(self, request, spider): ip_changer.get_new_ip() request.meta['proxy'] = 'http://127.0.0.1:8118' spider.log('Proxy : %s' % request.meta['proxy'])
# Demo: rotate the Tor exit IP ten times, pausing between rotations.
import time

from toripchanger import TorIpChanger

ip_changer = TorIpChanger(tor_password='******', tor_port=9051,
                          local_http_proxy='127.0.0.1:8118')

# The loop index itself is unused, so name it `_`.
for _ in range(10):
    new_ip = ip_changer.get_new_ip()
    print("New IP: " + new_ip)
    time.sleep(5)