def __init__(self, crawler: Crawler, max_count: int, allow_reuse_ip_after: int):
    '''Creates a new instance of TorProxyMiddleware.

    Keyword arguments:
    max_count -- maximum number of times a single IP is used
    allow_reuse_ip_after -- number of other IPs used before an IP can be reused
    '''
    self.crawler = crawler
    self.max_count = max_count
    self.tor_ip_changer = TorIpChanger(reuse_threshold=allow_reuse_ip_after)
    self.tor_ip_changer.get_new_ip()
    self.items_scraped = 0
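# A minimal sketch (not the project's actual code) of how such a middleware
# might use max_count and items_scraped in a Scrapy process_request hook:
# once max_count items have been handled, ask for a fresh Tor IP and reset
# the counter. The privoxy address is an assumption.
def process_request(self, request, spider):
    self.items_scraped += 1
    if self.items_scraped >= self.max_count:
        self.tor_ip_changer.get_new_ip()
        self.items_scraped = 0
    request.meta['proxy'] = 'http://127.0.0.1:8118'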
def __setup_ip_rotation(self, antiblock_config: Dict[str, Any] = {}):
    """Set up the configuration for IP rotation."""
    rot_type = antiblock_config["iprotator_type"]
    self.ip_rotation_type = rot_type

    if rot_type == 'tor':
        self.ip_change_after = antiblock_config.get('tor_iprotator_change_after', 1)
        self.ip_reuse_after = antiblock_config.get('tor_iprotator_allow_reuse_ip_after', 10)
        self.__validate_ip_tor_config()
        self.tor_controller = TorIpChanger(reuse_threshold=self.ip_reuse_after)
        self.tor_controller.get_new_ip()
    elif rot_type == 'proxy':
        self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
        self.__validate_ip_proxy_config()
    else:
        raise ValueError('Invalid ip rotation type: ' + rot_type)
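# Hypothetical antiblock_config dictionaries that would exercise both branches
# of __setup_ip_rotation above. The key names come from the method itself; the
# concrete values are made-up examples.
tor_config = {
    'iprotator_type': 'tor',
    'tor_iprotator_change_after': 1,
    'tor_iprotator_allow_reuse_ip_after': 10,
}
proxy_config = {
    'iprotator_type': 'proxy',
    'iprotator_proxy_list': ['http://10.0.0.1:3128', 'http://10.0.0.2:3128'],
}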
from flask import jsonify
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import app_factory

app = app_factory(__name__)

# Global IP store (using only specific `TorIpChanger` functionality).
IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    safe = IPSTORE._ip_is_safe(ip)
    if safe:
        IPSTORE._manage_used_ips(ip)

    return jsonify({"safe": safe})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=Config.IPSTORE_PORT)
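# A small usage sketch for the service above: query the /ip-is-safe/ endpoint
# from another process to ask whether an exit IP may be used. Only the route
# comes from the code; the host, port, and the `requests` call are assumptions.
import requests

def exit_ip_is_safe(ip, ipstore_host="127.0.0.1", ipstore_port=5000):
    url = f"http://{ipstore_host}:{ipstore_port}/ip-is-safe/{ip}/"
    response = requests.get(url)
    return response.json()["safe"]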
        month=random.randint(1, 12), day=random.randint(1, 28))
    delta = datetime.timedelta(days=random.randint(8, 260))
    begin_date = end_date - delta
    timeframe = f"{begin_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"
    return timeframe


while True:
    loc = random.choice(geo_list)
    search = random.choice(kw)
    try:
        current_ip = tor_ip_changer.get_new_ip()
    except Exception:
        # Keep going with the current IP if changing it fails.
        pass
    pytrends = TrendReq()
    print(loc)
    print(search)
    pytrends.build_payload(search, cat=0, timeframe=random_timeframe(), geo=loc)
    df = pytrends.interest_over_time()


if __name__ == "__main__":
    tor_ip_changer = TorIpChanger(
        tor_password='******',
        tor_port=9051,
        local_http_proxy='127.0.0.1:8118',
    )
    random_query()
# author = 'BlackSesion'

import base64
import json
import random
import re
import traceback
import urllib
import urllib2
import sys

from scrapy.exceptions import IgnoreRequest
from scrapy.conf import settings
from toripchanger import TorIpChanger

# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(reuse_threshold=10)


class RandomUserAgentMiddleware(object):

    def process_request(self, request, spider):
        ua = random.choice(settings.get('USER_AGENT_LIST'))
        if ua:
            request.headers.setdefault('User-Agent', ua)


class ProxyMiddleware(object):
    # overwrite process request
    _requests_count = 0
    _requests_count_x_ip = 10

    def process_request(self, request, spider):
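    # The excerpt above is cut off at process_request. A hedged sketch of the
    # usual continuation in this pattern (not the author's original code):
    # rotate the Tor IP every _requests_count_x_ip requests and route traffic
    # through a local Privoxy instance. The proxy address is an assumption.
        ProxyMiddleware._requests_count += 1
        if ProxyMiddleware._requests_count % ProxyMiddleware._requests_count_x_ip == 0:
            ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'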
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.scrapers.examplescraper2.custom_pipeline import (
    ExhaustApiLimitPipeLine,
)  # noqa
from scrapemeagain.scrapers.examplescraper2.scraper import ExampleScraper2
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper2")

# Prepare the scraping pipeline.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
def set_new_ip(self):
    return TorIpChanger(
        reuse_threshold=0,
        tor_password='******',
        tor_port=9051,
        local_http_proxy=self.settings.get('HTTP_PROXY')).get_new_ip()
from stem.util.log import get_logger
from stem.control import Controller
from scrapy.utils.project import get_project_settings
from toripchanger import TorIpChanger

# logger = get_logger()
# logger.propagate = False

# Default settings.
REUSE_THRESHOLD = 1
LOCAL_HTTP_PROXY = "127.0.0.1:8118"
NEW_IP_MAX_ATTEMPTS = 10
TOR_PASSWORD = "******"
TOR_ADDRESS = "127.0.0.1"
TOR_PORT = 9051
POST_NEW_IP_SLEEP = 0.5

ip_changer = TorIpChanger(
    reuse_threshold=REUSE_THRESHOLD,
    tor_password=TOR_PASSWORD,
    tor_port=TOR_PORT,
    local_http_proxy=LOCAL_HTTP_PROXY,
)


# Send "Change IP" signal to the Tor control port.
class TorMiddleware(object):

    def __init__(self):
        self.settings = get_project_settings()
        self._requests_count = 0
        self.controller = Controller.from_port(address=TOR_ADDRESS, port=TOR_PORT)
        self.controller.authenticate(password=TOR_PASSWORD)

    def set_new_ip(self):
        return TorIpChanger(
            reuse_threshold=0,
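# A hedged sketch of how the class above could send the "Change IP" signal
# directly through its stem Controller instead of (or in addition to)
# TorIpChanger. Signal.NEWNYM is the real stem API; the method name
# _request_new_identity and the use of POST_NEW_IP_SLEEP are assumptions.
import time
from stem import Signal

def _request_new_identity(self):
    self.controller.signal(Signal.NEWNYM)
    time.sleep(POST_NEW_IP_SLEEP)  # give Tor a moment to build a new circuit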
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.pipeline import Pipeline
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.scraper import ExampleScraper

# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents(__file__)

# Configure logging.
setup_logging(logger_name="example-scraper")

# Prepare the scraping pipeline.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
# https://stackoverflow.com/questions/43942689/error-while-receiving-a-control-message-socketclosed-empty-socket-content-i
from stem.util.log import get_logger

logger = get_logger()
logger.propagate = False

# https://stackoverflow.com/questions/45009940/scrapy-with-privoxy-and-tor-how-to-renew-ip/45010141
from toripchanger import TorIpChanger
from stem import Signal
from stem.control import Controller

# Password handling.
import os
from dotenv import load_dotenv

load_dotenv()

# TOR
TOR_PASSWORD = os.getenv('TOR_PASS')

# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(tor_password=TOR_PASSWORD, reuse_threshold=10)


class ProxyMiddleware(object):
    """Scrapy middleware that routes requests through Tor.

    To learn about Tor:
    https://github.com/WiliTest/Anonymous-scrapping-Scrapy-Tor-Privoxy-UserAgent

    Setting up Tor for the first time on Linux:
    https://jarroba.com/anonymous-scraping-by-tor-network/

    Configuring torrc for the first time:
    https://2019.www.torproject.org/docs/faq#torrc

    About TorIpChanger:
    https://gist.github.com/DusanMadar/8d11026b7ce0bce6a67f7dd87b999f6b
    """

    _requests_count = 0
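    # The excerpt above stops at _requests_count. A hedged sketch of the usual
    # continuation in this Scrapy + Privoxy + Tor pattern (not the author's
    # original code): change the Tor IP every N requests and point Scrapy at
    # the local Privoxy proxy. The value 10 and the proxy address are assumptions.
    def process_request(self, request, spider):
        ProxyMiddleware._requests_count += 1
        if ProxyMiddleware._requests_count % 10 == 0:
            ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'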
# Python powered way to get a unique Tor IP
# Docs: https://pypi.org/project/toripchanger/
# pip install toripchanger

# Basic example
from toripchanger import TorIpChanger

# Tor IP reuse is prohibited.
tor_ip_changer_0 = TorIpChanger(reuse_threshold=0)
current_ip = tor_ip_changer_0.get_new_ip()

# Current Tor IP address can be reused after one other IP was used (default setting).
tor_ip_changer_1 = TorIpChanger(local_http_proxy='127.0.0.1:8888')
current_ip = tor_ip_changer_1.get_new_ip()

# Current Tor IP address can be reused after 5 other Tor IPs were used.
tor_ip_changer_5 = TorIpChanger(tor_address="localhost", reuse_threshold=5)
current_ip = tor_ip_changer_5.get_new_ip()
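# A hedged follow-up to the basic example: verify that traffic routed through
# the local HTTP proxy really leaves via the new Tor exit IP. The proxy
# address matches tor_ip_changer_1 above; the `requests` call and the
# ipify.org echo service are assumptions, not part of toripchanger.
import requests

proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}
exit_ip = requests.get("https://api.ipify.org", proxies=proxies).text
print(f"New Tor IP: {current_ip}, observed exit IP: {exit_ip}")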
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# A Tor IP will be reused only after 300 different IPs were used.
ip_changer = TorIpChanger(tor_password='******', tor_port=9051, reuse_threshold=300)


class ProxyMiddleware(object):

    def process_request(self, request, spider):
        ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        spider.log('Proxy : %s' % request.meta['proxy'])
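# To activate a downloader middleware like ProxyMiddleware above, it has to be
# registered in the Scrapy project's settings.py. DOWNLOADER_MIDDLEWARES is the
# real Scrapy setting; the module path "myproject.middlewares" and the priority
# value 350 are placeholder assumptions.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 350,
}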