Example #1
# Imports and the class header below are reconstructed from the excerpt,
# which started at __init__ (the class name is taken from the docstring).
from scrapy.crawler import Crawler
from toripchanger import TorIpChanger


class TorProxyMiddleware:
    def __init__(self, crawler: Crawler, max_count: int,
                 allow_reuse_ip_after: int):
        '''Creates a new instance of TorProxyMiddleware.

        Keyword arguments:
            max_count -- maximum number of times an IP is used before a new
                one is requested
            allow_reuse_ip_after -- number of other IPs that must be used
                before an IP can be reused
        '''

        self.crawler = crawler
        self.max_count = max_count

        self.tor_ip_changer = TorIpChanger(
            reuse_threshold=allow_reuse_ip_after)
        self.tor_ip_changer.get_new_ip()

        self.items_scraped = 0
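
    # The methods below are NOT part of the original snippet; they are a
    # plausible sketch of how such a middleware is usually completed. The
    # setting names (TOR_MAX_REQ_PER_IP, TOR_ALLOW_REUSE_IP_AFTER) and the
    # defaults are assumptions.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler,
            max_count=crawler.settings.getint('TOR_MAX_REQ_PER_IP', 10),
            allow_reuse_ip_after=crawler.settings.getint(
                'TOR_ALLOW_REUSE_IP_AFTER', 10),
        )

    def process_request(self, request, spider):
        # Count usages of the current IP and ask Tor for a new circuit once
        # max_count is reached.
        self.items_scraped += 1
        if self.items_scraped >= self.max_count:
            self.tor_ip_changer.get_new_ip()
            self.items_scraped = 0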
Example #2
    def __setup_ip_rotation(self, antiblock_config: Dict[str, Any] = {}):
        """
        Setup the configurations for the ip rotation
        """

        rot_type = antiblock_config["iprotator_type"]
        self.ip_rotation_type = rot_type
        if rot_type == 'tor':
            self.ip_change_after = antiblock_config\
                .get('tor_iprotator_change_after', 1)
            self.ip_reuse_after = antiblock_config\
                .get('tor_iprotator_allow_reuse_ip_after', 10)
            self.__validate_ip_tor_config()

            self.tor_controller = TorIpChanger(
                reuse_threshold=self.ip_reuse_after)
            self.tor_controller.get_new_ip()
        elif rot_type == 'proxy':
            self.proxy_list = antiblock_config.get('iprotator_proxy_list', [])
            self.__validate_ip_proxy_config()
        else:
            raise ValueError('Invalid ip rotation type: ' + rot_type)
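For reference, a configuration dict that would exercise the Tor branch of this method might look like the following sketch; the key names come from the code above, while the values are only illustrative:

antiblock_config = {
    'iprotator_type': 'tor',                   # or 'proxy'
    'tor_iprotator_change_after': 1,           # rotate the IP after every use
    'tor_iprotator_allow_reuse_ip_after': 10,  # reuse an IP only after 10 others
    # only read when 'iprotator_type' == 'proxy':
    # 'iprotator_proxy_list': ['http://proxy1:8080', 'http://proxy2:8080'],
}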
Example #3
from flask import jsonify
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import app_factory


app = app_factory(__name__)


# Global IP store (using only specific `TorIpChanger` functionality).
IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    safe = IPSTORE._ip_is_safe(ip)
    if safe:
        IPSTORE._manage_used_ips(ip)

    return jsonify({"safe": safe})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=Config.IPSTORE_PORT)
Example #4
import datetime
import random

from pytrends.request import TrendReq
from toripchanger import TorIpChanger

# NOTE: the imports, the geo_list/kw placeholders and the opening lines of
# random_query()/random_timeframe() are reconstructed; only the month=/day=
# arguments and everything from `delta = ...` onwards are from the original.
geo_list = ["US", "GB", "DE"]        # illustrative geo codes
kw = [["bitcoin"], ["ethereum"]]     # illustrative keyword lists


def random_query():
    def random_timeframe():
        end_date = datetime.date(year=random.randint(2015, 2020),  # range assumed
                                 month=random.randint(1, 12),
                                 day=random.randint(1, 28))
        delta = datetime.timedelta(days=random.randint(8, 260))
        begin_date = end_date - delta
        timeframe = f"{begin_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"
        return timeframe

    while True:
        loc = random.choice(geo_list)
        search = random.choice(kw)
        try:
            current_ip = tor_ip_changer.get_new_ip()
        except Exception:
            pass
        pytrends = TrendReq()
        print(loc)
        print(search)
        pytrends.build_payload(search,
                               cat=0,
                               timeframe=random_timeframe(),
                               geo=loc)
        df = pytrends.interest_over_time()


if __name__ == "__main__":

    tor_ip_changer = TorIpChanger(tor_password='******',
                                  tor_port=9051,
                                  local_http_proxy='127.0.0.1:8118')
    random_query()
Example #5
# author = 'BlackSesion'
import base64
import json
import random
import re
import traceback
import urllib
import urllib2

import sys
from scrapy.exceptions import IgnoreRequest
from scrapy.conf import settings
from toripchanger import TorIpChanger

# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(reuse_threshold=10)


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(settings.get('USER_AGENT_LIST'))
        if ua:
            request.headers.setdefault('User-Agent', ua)


class ProxyMiddleware(object):
    # overwrite process request
    _requests_count = 0
    _requests_count_x_ip = 10

    def process_request(self, request, spider):
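        # A plausible continuation (assumed; the original method body is not
        # shown): change the Tor IP every _requests_count_x_ip requests and
        # route traffic through a local Privoxy instance.
        self._requests_count += 1
        if self._requests_count >= self._requests_count_x_ip:
            self._requests_count = 0
            ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'  # Privoxy address assumed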
Example #6
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.scrapers.examplescraper2.custom_pipeline import (
    ExhaustApiLimitPipeLine,
)  # noqa
from scrapemeagain.scrapers.examplescraper2.scraper import ExampleScraper2
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents
from toripchanger import TorIpChanger


# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper2")


# Prepare the scraping pipeline.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
Example #7
    def set_new_ip(self):
        return TorIpChanger(
            reuse_threshold=0,
            tor_password='******',
            tor_port=9051,
            local_http_proxy=self.settings.get('HTTP_PROXY')).get_new_ip()
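A helper like this is often wired into process_response so the IP only changes when the target starts blocking. The sketch below is an assumption; the status codes and retry strategy are illustrative, not part of the original class:

    def process_response(self, request, response, spider):
        # Rotate the Tor IP when the site appears to be blocking us.
        if response.status in (403, 429):
            new_ip = self.set_new_ip()
            spider.logger.info('Blocked; retrying %s via new Tor IP %s',
                               request.url, new_ip)
            return request.replace(dont_filter=True)
        return response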
Example #8
from scrapy.utils.project import get_project_settings
from stem.control import Controller
from stem.util.log import get_logger
from toripchanger import TorIpChanger

# logger = get_logger()
# logger.propagate = False

# Default settings.
REUSE_THRESHOLD = 1
LOCAL_HTTP_PROXY = "127.0.0.1:8118"
NEW_IP_MAX_ATTEMPTS = 10
TOR_PASSWORD = "******"
TOR_ADDRESS = "127.0.0.1"
TOR_PORT = 9051
POST_NEW_IP_SLEEP = 0.5

ip_changer = TorIpChanger(reuse_threshold=REUSE_THRESHOLD,
                          tor_password=TOR_PASSWORD,
                          tor_port=TOR_PORT,
                          local_http_proxy=LOCAL_HTTP_PROXY)


# Send "Change IP" signal to tor control port
class TorMiddleware(object):
    def __init__(self):
        self.settings = get_project_settings()
        self._requests_count = 0
        self.controller = Controller.from_port(address=TOR_ADDRESS,
                                               port=TOR_PORT)
        self.controller.authenticate(password=TOR_PASSWORD)

    def set_new_ip(self):
        return TorIpChanger(
            reuse_threshold=0,
Example #9
from scrapemeagain.config import Config
from scrapemeagain.databaser import Databaser
from scrapemeagain.pipeline import Pipeline
from scrapemeagain.utils import services
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents
from toripchanger import TorIpChanger

from examplescraper.scraper import ExampleScraper


# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents(__file__)

# Configure logging.
setup_logging(logger_name="example-scraper")


# Prepare the scraping pipeline.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
Example #10
#https://stackoverflow.com/questions/43942689/error-while-receiving-a-control-message-socketclosed-empty-socket-content-i
from stem.util.log import get_logger
logger = get_logger()
logger.propagate = False

# https://stackoverflow.com/questions/45009940/scrapy-with-privoxy-and-tor-how-to-renew-ip/45010141
from toripchanger import TorIpChanger
from stem import Signal
from stem.control import Controller
# password handling
import os
from dotenv import load_dotenv

load_dotenv()

# TOR
TOR_PASSWORD = os.getenv('TOR_PASS')
# A Tor IP will be reused only after 10 different IPs were used.
ip_changer = TorIpChanger(tor_password=TOR_PASSWORD, reuse_threshold=10)


class ProxyMiddleware(object):
    """

    learning about to learn about TOR
    https://github.com/WiliTest/Anonymous-scrapping-Scrapy-Tor-Privoxy-UserAgent
    # setting TOR for the first time on Linux
    https://jarroba.com/anonymous-scraping-by-tor-network/
    config the to for the first time
    https://2019.www.torproject.org/docs/faq#torrc
    about TorIpChanger
    https://gist.github.com/DusanMadar/8d11026b7ce0bce6a67f7dd87b999f6b
    """
    _requests_count = 0
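
    # A plausible continuation (assumed; the rest of the class is not shown):
    # rotate the Tor IP every few requests and send traffic through Privoxy.
    _requests_per_ip = 10  # assumed value

    def process_request(self, request, spider):
        self._requests_count += 1
        if self._requests_count >= self._requests_per_ip:
            self._requests_count = 0
            ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'  # Privoxy address assumed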
Example #11
# Python-powered way to get a unique Tor IP

# Docs: https://pypi.org/project/toripchanger/

# pip install toripchanger

# Basic example
from toripchanger import TorIpChanger

# Tor IP reuse is prohibited.
tor_ip_changer_0 = TorIpChanger(reuse_threshold=0)
current_ip = tor_ip_changer_0.get_new_ip()

# Current Tor IP address can be reused after one other IP was used (default setting).
tor_ip_changer_1 = TorIpChanger(local_http_proxy='127.0.0.1:8888')
current_ip = tor_ip_changer_1.get_new_ip()

# Current Tor IP address can be reused after 5 other Tor IPs were used.
tor_ip_changer_5 = TorIpChanger(tor_address="localhost", reuse_threshold=5)
current_ip = tor_ip_changer_5.get_new_ip()
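Outside of Scrapy, the typical pattern is to request a fresh IP and then send HTTP traffic through the local Tor/Privoxy proxy. A minimal sketch follows; the proxy address is the common Privoxy default and the target URL is only for demonstration, neither is taken from this example:

import requests
from toripchanger import TorIpChanger

changer = TorIpChanger(reuse_threshold=5, local_http_proxy='127.0.0.1:8118')

for _ in range(3):
    new_ip = changer.get_new_ip()  # ask Tor for a new circuit
    resp = requests.get(
        'https://icanhazip.com',
        proxies={'http': 'http://127.0.0.1:8118',
                 'https': 'http://127.0.0.1:8118'},
        timeout=10,
    )
    print(new_ip, resp.text.strip())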
Example #12
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# A Tor IP will be reused only after 300 different IPs were used.
ip_changer = TorIpChanger(tor_password='******',
                          tor_port=9051,
                          reuse_threshold=300)


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        ip_changer.get_new_ip()
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        spider.log('Proxy : %s' % request.meta['proxy'])
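
For a middleware like this to take effect, it has to be enabled in the Scrapy project settings. A minimal sketch is shown below; the module path and priority are placeholders:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 100,
}

Note that this particular middleware calls get_new_ip() on every request, which is aggressive; the counter-based variants in the earlier examples change the IP only every N requests.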