Example #1
 # assumes: import urllib.request, urllib.parse, http.cookiejar
 def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
              timeout=60):
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.cache = cache
     self.num_retries = None  # we will set this per request
     self.timeout = timeout

     LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
     LOGIN_EMAIL = 'caicai'
     LOGIN_PASSWORD = '******'

     postdata = urllib.parse.urlencode({
         'user_login': LOGIN_EMAIL,
         'user_pass': LOGIN_PASSWORD,
         'action': 'user_login',
         'remember_me': '1',
         'redirect_url': 'http://www.jobbole.com/',
     }).encode('utf-8')
     req = urllib.request.Request(LOGIN_URL, postdata)
     req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0')
     # create a CookieJar to hold the session cookie
     cjar = http.cookiejar.CookieJar()
     # create an opener with cookie support (and the proxies, if any were supplied)
     handlers = [urllib.request.HTTPCookieProcessor(cjar)]
     if proxies:
         handlers.append(urllib.request.ProxyHandler(proxies=proxies))
     opener = urllib.request.build_opener(*handlers)
     # install the opener globally
     urllib.request.install_opener(opener)

     response = opener.open(req)
     data = response.read()
     with open('3.html', 'wb') as f:
         f.write(data)
Example #2
 def __init__(self, since=1, th_sec=5):
     if since not in [-1, 1, 3, 7, 14]:
         raise Exception('SinceLevelError: since is out of range [-1,1,3,7,14].')
     self.since = since
     self.thr = Throttle(th_sec)
     if self.since == -1:
         self.url_start = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?'
     else:
         self.url_start = ('https://www.otodom.pl/wynajem/mieszkanie/warszawa/'
                           '?search%5Bdescription%5D=1'
                           '&search%5Bcreated_since%5D=' + str(self.since) +
                           '&search%5Bregion_id%5D=7&search%5Bsubregion_id%5D=197'
                           '&search%5Bcity_id%5D=26')
     self.user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/80.0.3987.137 Safari/537.36 OPR/67.0.3575.79')
     self.__get_max_page()
     self.links = Queue()
Example #3
def main(sx_url,
         access_log_path=DEFAULT_ACCESS_LOG_PATH,
         upload_interval=DEFAULT_UPLOAD_INTERVAL):
    results = defaultdict(list)  # { date: entries }
    maybe_upload_results = Throttle(upload_all_results, upload_interval)
    for data in listen_and_parse(access_log_path):
        key = data['datetime'].date()
        results[key].append(data)
        maybe_upload_results(results, sx_url)
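Note that the Throttle here differs from the delay-only throttle in the other examples: it wraps a callable (upload_all_results) and rate-limits how often it actually runs. A minimal sketch of such a wrapper, offered as an assumption rather than this project's real implementation:

import time

class Throttle:
    """Wrap a function and invoke it at most once every `interval` seconds."""
    def __init__(self, func, interval):
        self.func = func
        self.interval = interval
        self.last_call = 0.0  # timestamp of the last real invocation

    def __call__(self, *args, **kwargs):
        now = time.time()
        if now - self.last_call >= self.interval:
            self.last_call = now
            return self.func(*args, **kwargs)
        # otherwise silently skip this call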
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxies=None,
                 delay=0.0001,
                 max_depth=999999,
                 max_count=10000):
    """ 
    Recorre los link en profundidad 
    """
    i = 0
    crawl_queue = [start_url]
    result = []
    # Dict of URLs already visited, so they are not parsed again
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue and i <= max_count:
        url = crawl_queue.pop()

        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            if i > max_count:
                print('Skipping %s due to exceeding max_count' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            i += 1
            print(i)
            # Yield a Scrapy-like item that holds the URL and the plain text,
            # and also saves it to a file
            yield WikiItem(html, url)

            # Filter the links to follow
            for link in get_links(html):
                if link.startswith('#'):
                    # skip in-page anchor links
                    continue
                if re.match(link_regex, link):
                    # Small patch: the local wiki did not prepend this 'A' segment when listing
                    # links; on an online page there was no problem after removing it
                    # abs_link2 = urljoin(start_url, 'A/')
                    # abs_link = urljoin(abs_link2, link)
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen and len(abs_link) < 200:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
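Because this link_crawler is a generator that yields a WikiItem per downloaded page, a caller consumes it lazily. A hypothetical invocation (the seed URL, regex, and the WikiItem attribute name are placeholders, not taken from the source):

# Placeholder seed URL and link pattern for illustration only.
for item in link_crawler('https://example.org/wiki/', r'/wiki/', max_count=100):
    print(item.url)  # assumes WikiItem keeps the url it was built from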
Example #5
 def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, 
             num_retries=DEFAULT_RETRIES, cache=DEFAULT_CACHE, 
             proxies=DEFAULT_PROXIES, opener=DEFAULT_OPENER, 
             timeout=DEFAULT_TIMEOUT):
     socket.setdefaulttimeout(timeout)
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.cache = cache
     self.opener = opener
Example #6
 def __init__(self,
              delay=5,
              user_agent=None,
              num_retries=1,
              proxies=None,
              cache=None):
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.cache = cache
def main_link_crawler(start_url,
                      link_regex,
                      robots_url=None,
                      user_agent='bbbbbbb',
                      proxies=None,
                      delay=3,
                      max_depth=4,
                      num_retries=2,
                      cache={}):
    """ Crawl from the given start URL following links matched by link_regex. In the current
        implementation, we do not actually scrapy any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxy (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            scrape_callback (function): function to call after each download (default: None)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)

            html = download(url, user_agent=user_agent, proxy=proxies)
            if not html:
                continue
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
    return seen
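The docstring above spells out the arguments; a hypothetical call (the site and regex below are placeholders) that collects the returned url-to-depth mapping might look like:

# Placeholder start URL and link pattern for illustration only.
seen = main_link_crawler('http://example.webscraping.com', r'/(index|view)/',
                         user_agent='wswp', delay=3, max_depth=4)
print('%d links discovered' % len(seen))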
Example #8
    def __init__(self, delay=1, user_agent='saint_data', cache={}):
        """ __init__ method initializes a Downloader object
            @parameters
                user_agent:     (str)   user agent for request header
                cache:          (dict)  stores all downloaded
        """

        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = None  # this variable will be set later by request (in __call__ method)
        self.cache = cache
Example #9
 def __init__(self,
              delay=5,
              user_agent='wswp',
              proxies=None,
              cache={},
              timeout=60):
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.cache = cache
     self.num_retries = None  # we will set this per request
     self.timeout = timeout
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxies=None,
                 delay=5,
                 max_depth=5):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                              (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (dict): proxy dict w/ keys 'http' and 'https', values
                            are strs (i.e. 'http(s)://IP') (default: None)
            delay (int): seconds to throttle between requests
                         to one domain (default: 5)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 5)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
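The proxies kwarg documented above is a dict keyed by scheme; a hypothetical value (the addresses are placeholders) passed into link_crawler could be:

# Placeholder proxy endpoints illustrating the documented format.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'https://10.10.1.10:1080',
}
link_crawler('http://example.webscraping.com', r'/(index|view)/', proxies=proxies)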
Example #11
 def __init__(self,
              user_agent='wsap',
              proxies=None,
              delay=5,
              numTry=5,
              cache=None,
              timeout=30):
     self.user_agent = user_agent
     self.proxies = proxies
     self.delay = delay
     self.numTry = numTry
     # fall back to a RedisCache when no cache is supplied
     self.cache = cache if cache is not None else RedisCache()
     self.throt = Throttle(delay)
     self.timeOut = timeout
Example #12
 def __init__(self,
              delay=5,
              user_agent='wswp',
              proxies=None,
              num_retries=1,
              cache=None,
              opener=None,
              timeout=30):
     socket.setdefaulttimeout(timeout)
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.cache = cache
     self.opener = opener
Example #13
def link_crawler(seed_url, link_regex=None, delay=10, headers=None, max_depth=2, max_urls=1, user_agent='wswp',
                 proxy=None, num_retries=1):
    """
    :param seed_url: the master (seed) url to start crawling from
    :param link_regex: regex used to decide which links to follow
    :return: None; prints the dict of seen urls and their crawl depths
    """
    crawl_queue = Queue.deque([seed_url])  # URLs still to be crawled (used like a list)
    seen = {seed_url: 0}  # depth of the root url (0) and of every child url
    num_urls = 0  # number of urls downloaded so far
    # robots.txt parser used to check which urls may be fetched
    rp = get_robots(seed_url)
    # create the throttle object
    throttle = Throttle(delay)
    # request headers dictionary
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent  # add the custom user agent to the headers

    while crawl_queue:  # loop until the queue has been emptied
        url = crawl_queue.pop()  # take the most recently appended url; this is the url crawled in this pass
        if rp.can_fetch(user_agent, url):  # check whether robots.txt allows crawling this url
            throttle.wait(url)  # throttle requests to the same domain (10 seconds by default)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)  # download the page

            links = []  # child urls matched by the regex
            depth = seen[url]  # depth recorded for the current url
            if depth != max_depth:  # only queue further links while max depth has not been reached
                if link_regex:
                    links.extend(link for link in get_link(html) if re.match(link_regex, link))  # collect matching child urls

                for link in links:  # iterate over the matched links
                    link = normalize(seed_url, link)  # join the url fragment with the seed url into an absolute link
                    if link not in seen:  # a new link: record it one level deeper than the current url
                        seen[link] = depth + 1  # depth to use when this link is downloaded
                        if same_domain(seed_url, url):  # only queue links on the same domain (host + port)
                            crawl_queue.append(link)  # add the link to the crawl queue

            # num_urls caps the total number of downloads, which bounds how many urls are crawled
            num_urls += 1  # count this download
            if num_urls == max_urls:  # stop once the maximum number of downloads is reached
                break
        else:
            print 'Blocked by robots.txt:', url
    print seen
Example #14
    def __init__(self):
        self.code_type_to_message = {}
        self.code_type_to_type = {}
        self.type_to_code_type = {}

        self.type_to_reserved_count = {}
        self.type_to_released_count = {}

        self.add_message_type(Acknowledge())
        self.add_message_type(Keepalive())
        self.add_message_type(Throttle())
        self.add_message_type(ChallengeRequest())
        self.add_message_type(ChallengeResponse())
        self.add_message_type(JoinRequest())
        self.add_message_type(JoinResponse())
        self.add_message_type(LeaveRequest())
        self.add_message_type(LeaveResponse())
        self.add_message_type(InjectRequest())
        self.add_message_type(InjectResponse())
        self.add_message_type(ModifyRequest())
        self.add_message_type(ModifyResponse())
        self.add_message_type(EjectRequest())
        self.add_message_type(EjectResponse())
        self.add_message_type(InteractRequest())
        self.add_message_type(InteractResponse())
        self.add_message_type(ExamineRequest())
        self.add_message_type(ExamineResponse())
        self.add_message_type(AttachRequest())
        self.add_message_type(AttachResponse())
        self.add_message_type(DetachRequest())
        self.add_message_type(DetachResponse())
        self.add_message_type(HandoverRequest())
        self.add_message_type(HandoverResponse())
        self.add_message_type(ListBubblesRequest())
        self.add_message_type(ListBubblesResponse())
        self.add_message_type(PerceptionEvent())
        self.add_message_type(MovementEvent())
        self.add_message_type(DisappearanceEvent())
        self.add_message_type(HandoverEvent())
        self.add_message_type(ActionEvent())
        self.add_message_type(SynchronizationBeginEvent())
        self.add_message_type(SynchronizationEndEvent())
Example #15
def crawl_link(seed_url, link_regex, max_depth=2, delay=3, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        if html is None:
            return
        
        links = []
        if scrape_callback:
            links.extend(scrape_callback(url, html) or [])  # 'or []' appends an empty list when the callback returns nothing
        # check whether max depth has been reached
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    # check whether the link has already been seen
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
Example #16
import json
import threading
import time
import selenium
import os
from lxml.html import fromstring
import random

from link_crawler import download
from selenium_auth import get_driver, login
from throttle import Throttle
from helpers import jsonify

SLEEP_TIME = 2
CURRENT_DIR = os.getcwd()
throttle = Throttle(3)

# save cookies to a json file to avoid having to log in repeatedly
try:
    with open('cookies') as f_in:
        cookies = json.load(f_in)
except FileNotFoundError:
    driver = get_driver()
    login(driver)
    cookies = driver.get_cookies()
    driver.quit()
    with open('cookies', 'w') as f_out:
        json.dump(cookies, f_out)
except:
    print('Something went wrong during log in')
    raise
def rotate_log_files(options):
    with request_lock(options['lock_file']) as acquired:
        if not acquired:
            logger.warn('Not rotating, previous job still underway')
            return

        # Check we can send signals to all relevant processes
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        unkillable_processes = set()
        for process_name in options['reopen_file_signals'].keys():
            pids = pids_for_processes[process_name]
            try:
                for pid in pids:
                    kill_if_running(pid, 0)
            except OSError:
                unkillable_processes.add(process_name)
        if unkillable_processes:
            logger.error('Cannot send signal to some processes, aborting: %s' %
                         ', '.join(unkillable_processes))
            return

        files_to_rotate = [
            file for file in os.listdir(options['log_directory'])
            if fnmatch.fnmatch(file, options['filename_filter'])
        ]

        rotation_suffix = datetime.datetime.now().strftime(
            options['timestamp_format'])

        filename_mapping = {
            file: file + rotation_suffix
            for file in files_to_rotate
        }

        # Move all files
        rotated_files = []
        for original_name, rotated_name in filename_mapping.items():
            original_path = os.path.join(options['log_directory'],
                                         original_name)
            rotated_path = os.path.join(options['log_directory'], rotated_name)
            if not os.path.exists(rotated_path):
                os.rename(original_path, rotated_path)
                rotated_files.append(rotated_name)
            else:
                logger.warning(
                    'Did not rotate file. File called %s already existed',
                    rotated_path)

        # Run kick commands
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        for process_name, signal_name in options['reopen_file_signals'].items():
            signal_id = getattr(signal, 'SIG' + signal_name.upper())
            pids = pids_for_processes[process_name]
            for pid in pids:
                kill_if_running(pid, signal_id)

        throttle_file_checks = Throttle(FILE_OPEN_CHECK_INTERVAL)
        checks_without_closed_files = 0
        s3_store = S3LogStore(options)

        # Get files which have no open handles and process them as soon as we can.
        # Files with open handles wait until next time through the loop. We throttle
        # to avoid checking too often.
        # TODO: Should we also pick up and retry copying any gz files which we could not
        #       copy to s3 last time around?
        open_files = rotated_files
        while open_files:
            throttle_file_checks.wait()
            closed_files, open_files = check_for_open_files(open_files)
            for ready_file in closed_files:
                try:
                    ready_path = os.path.join(options['log_directory'],
                                              ready_file)
                    compressed_path = compress_file(ready_path)
                    s3_store.store_file(compressed_path)
                    os.unlink(compressed_path)
                except Exception:
                    logger.error('Unexpected error processing %s',
                                 ready_file,
                                 exc_info=True)
            if len(closed_files):
                checks_without_closed_files = 0
            else:
                checks_without_closed_files += 1
                if checks_without_closed_files > MAX_CHECKS_WITHOUT_FILE_CLOSED:
                    logger.error(
                        'Gave up waiting for files to close. Open files: %s' %
                        ', '.join(open_files))
                    return
Example #18
from flask import (abort, after_this_request, Flask, request, render_template,
                   url_for)
from flask_cache import Cache
from flask_jsonpify import jsonify
from raven.contrib.flask import Sentry
from twilio import TwilioRestException

from models import db, aggregate_stats, log_call, call_count, call_list
from political_data import PoliticalData
from cache_handler import CacheHandler
from fftf_leaderboard import FFTFLeaderboard
from access_control_decorator import crossdomain, requires_auth

try:
    from throttle import Throttle
    throttle = Throttle()
except ImportError:
    throttle = None

app = Flask(__name__)

app.config.from_object('config.ConfigProduction')

cache = Cache(app, config={'CACHE_TYPE': 'simple'})
sentry = Sentry(app)

# db.init_app(app) # JL HACK ~ disable mysql

# Optional Redis cache, for caching Google spreadsheet campaign overrides
cache_handler = CacheHandler(app.config['REDIS_URL'])
Example #19
File: main.py  Project: Colaplusice/zhihu
                html = self.cache[url].get("html") if self.cache[url] else None
                if html is None:
                    html = self.downloader.download(url)
                    self.cache.collection.update_one(
                        {"_id": url}, {"$set": {"html": html}}, upsert=True
                    )
                    self.throttle.wait(url)
                else:
                    print("load user html from cache {}".format(current_user))
                user_dict = self.parser.parse2dict(current_user, html)
                self.cache[current_user] = user_dict
            if len(url_queue) < 1000 - finished:
                add_user_in_queue(user_dict)


if __name__ == "__main__":
    downloader = ZhihuDownloader()
    # phone_number = "your phone"
    # password = "******"
    mongo_cache = MongoCache(db_name="zhihu", collection_name="default")
    parser = ZhihuParser()
    saver = ZhihuSaver
    begin = Crawler(
        cache=mongo_cache,
        downloader=downloader,
        parser=parser,
        begin_url=BEGIN_URL,
        throttle=Throttle(delay=1),
    )
    begin.run()
Example #20
 def create_message_typecode(message_type_code):
     if message_type_code == MXP_MESSAGE_ACKNOWLEDGE:
         return Acknowledge()
     if message_type_code == MXP_MESSAGE_KEEPALIVE:
         return Keepalive()
     if message_type_code == MXP_MESSAGE_THROTTLE:
         return Throttle()
     if message_type_code == MXP_MESSAGE_CHALLENGE_REQUEST:
         return ChallengeRequest()
     if message_type_code == MXP_MESSAGE_CHALLENGE_RESPONSE:
         return ChallengeResponse()
     if message_type_code == MXP_MESSAGE_JOIN_REQUEST:
         return JoinRequest()
     if message_type_code == MXP_MESSAGE_JOIN_RESPONSE:
         return JoinResponse()
     if message_type_code == MXP_MESSAGE_LEAVE_REQUEST:
         return LeaveRequest()
     if message_type_code == MXP_MESSAGE_LEAVE_RESPONSE:
         return LeaveResponse()
     if message_type_code == MXP_MESSAGE_INJECT_REQUEST:
         return InjectRequest()
     if message_type_code == MXP_MESSAGE_INJECT_RESPONSE:
         return InjectResponse()
     if message_type_code == MXP_MESSAGE_MODIFY_REQUEST:
         return ModifyRequest()
     if message_type_code == MXP_MESSAGE_MODIFY_RESPONSE:
         return ModifyResponse()
     if message_type_code == MXP_MESSAGE_EJECT_REQUEST:
         return EjectRequest()
     if message_type_code == MXP_MESSAGE_EJECT_RESPONSE:
         return EjectResponse()
     if message_type_code == MXP_MESSAGE_INTERACT_REQUEST:
         return InteractRequest()
     if message_type_code == MXP_MESSAGE_INTERACT_RESPONSE:
         return InteractResponse()
     if message_type_code == MXP_MESSAGE_EXAMINE_REQUEST:
         return ExamineRequest()
     if message_type_code == MXP_MESSAGE_EXAMINE_RESPONSE:
         return ExamineResponse()
     if message_type_code == MXP_MESSAGE_ATTACH_REQUEST:
         return AttachRequest()
     if message_type_code == MXP_MESSAGE_ATTACH_RESPONSE:
         return AttachResponse()
     if message_type_code == MXP_MESSAGE_DETACH_REQUEST:
         return DetachRequest()
     if message_type_code == MXP_MESSAGE_DETACH_RESPONSE:
         return DetachResponse()
     if message_type_code == MXP_MESSAGE_HANDOVER_REQUEST:
         return HandoverRequest()
     if message_type_code == MXP_MESSAGE_HANDOVER_RESPONSE:
         return HandoverResponse()
     if message_type_code == MXP_MESSAGE_LIST_BUBBLES_REQUEST:
         return ListBubblesRequest()
     if message_type_code == MXP_MESSAGE_LIST_BUBBLES_RESPONSE:
         return ListBubblesResponse()
     if message_type_code == MXP_MESSAGE_PERCEPTION_EVENT:
         return PerceptionEvent()
     if message_type_code == MXP_MESSAGE_MOVEMENT_EVENT:
         return MovementEvent()
     if message_type_code == MXP_MESSAGE_DISAPPEARANCE_EVENT:
         return DisappearanceEvent()
     if message_type_code == MXP_MESSAGE_HANDOVER_EVENT:
         return HandoverEvent()
     if message_type_code == MXP_MESSAGE_ACTION_EVENT:
         return ActionEvent()
     if message_type_code == MXP_MESSAGE_SYNCHRONIZATION_BEGIN_EVENT:
         return SynchronizationBeginEvent()
     if message_type_code == MXP_MESSAGE_SYNCHRONIZATION_END_EVENT:
         return SynchronizationEndEvent()
 def setUp(self):
     self.mock_time = MockTime()
     self.test_throttle = Throttle(10, time=self.mock_time)
from throttle import Throttle
from download import download, search_codes
from string import ascii_lowercase
import re
import csv
import itertools

throttle = Throttle(0)
"""
money.rediff.com has the simplest scripcode structure I found: all scrips are grouped by their first letter. This code first finds out how many stocks start with each letter; the counts are recorded in the list 'index' below.
"""
index = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    html = str(download('https://money.rediff.com/companies/{}'.format(x)))
    match = re.search('>Showing 1 - (.*?) of (.*?) ', html)  # renamed from 'len' to avoid shadowing the built-in
    index.append(int(match.group(2)))
"""
Once I have the per-letter counts, I iterate through every page of the listing and look for scripcodes, which are six-digit numbers, with a regex. The variables look ugly, but the code is functional and will only be run once in a blue moon; I'll improve it later if I get time. Basically, this is an unintelligent iterative crawler.
"""

ctr = 0
b = []
prod = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    for i in itertools.count(1, 200):
        limit = index[ctr]
        if (i > limit):
            break
        b = search_codes('https://money.rediff.com/companies/{}/{}-{}'.format(
Example #23
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='statista',
                 max_depth=-1,
                 delay=3,
                 proxies=None,
                 num_retries=2,
                 cache=None,
                 scraper_callback=None):

    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]

    #: keep track of seen urls
    seen = {}

    robots = {}

    throttle = Throttle(delay)

    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()

        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue

        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)

        #: Get the robot parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)

        #: set a default robots url and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: continue to crawl even if there are problems finding robots.txt
                #: file
                robots_file_present = True
            # associate each domain with a corresponding parser, whether
            # present or not
            robots[domain] = robot_parser

        elif domain in robots:
            robots_file_present = True

        #: crawl only when url passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip link if you have crawled it more than max depth
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)

            #: Get all links from page and filter only those matching given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if link is well formed and correct
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)

                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
Example #24
date_for_load = datetime.datetime.now()
if args.d:
    try:
        args_date = datetime.datetime.strptime(args.d, "%d/%m/%Y")
        # logging.debug("args date is: ")
        # logging.debug(args_date)
        date_for_load = args_date
    except Exception as e:
        logging.error("Invalid command line parameter:")
        logging.error(args)
        logging.error(e)
        date_for_load = datetime.datetime.now()

# init delay
throttle = Throttle(config.delay)

# create new loader instance
ldr_kz_nb = Loader_KZ_NB()
ldr_kz_bai_alfa = Loader_KZ_bai_alfa()
kz_bai_halyk_cash_ldr = Loader_KZ_bai_halyk_cash()
kz_bai_halyk_cards_ldr = Loader_KZ_bai_halyk_cards()
kz_bai_kkb_cash_ldr = Loader_KZ_bai_kkb_cash()
kz_bai_kkb_cards_ldr = Loader_KZ_bai_kkb_cards()

# here is the place for adding an instance into the loaders list
loaders_list = [
    ldr_kz_nb, ldr_kz_bai_alfa, kz_bai_halyk_cash_ldr, kz_bai_halyk_cards_ldr,
    kz_bai_kkb_cash_ldr, kz_bai_kkb_cards_ldr
]
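Almost every example above constructs Throttle(delay) and calls throttle.wait(url) before a download, but the class itself never appears in these listings. The following is a minimal sketch of a domain-keyed throttle in that style, an assumption based on how it is used rather than any one project's actual code:

import time
from urllib.parse import urlparse

class Throttle:
    """Add a delay between downloads to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()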