def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
def main():
    default_logging()
    bot = CEURSpider()
    print '\n######## This is a program used to extract data from CEUR Workshop Proceedings. #######\n'
    print '\nYou can input the workshop number to get the transformed rdf data into the rdfdb.ttl file in the current directory.\n'
    print '\nFor Example: \n' \
          '\n\t 1). 1513 \t \t \t \t- you will get the transformed rdf data from http://ceur-ws.org/Vol-1513/\n' \
          '\n\t 2). 1513-1550 \t \t \t \t- you will get the transformed rdf data between Vol-1513 and Vol-1550\n' \
          '\n\t 3). 1513 1540 1560 \t \t \t- you will get the transformed rdf data from Vol-1513, Vol-1540 ' \
          'and Vol-1560\n'
    vol_numbers = raw_input("Please enter volumes you want to transfer: ")
    input_urls = []
    if re.match(r'^\d+$', vol_numbers):
        # A single volume number, e.g. "1513"
        input_urls.append("http://ceur-ws.org/Vol-" + str(vol_numbers) + "/")
    elif re.match(r'(\d+)-(\d+)$', vol_numbers):
        # A range of volumes, e.g. "1513-1550"
        vols = vol_numbers.split('-')
        input_urls = ["http://ceur-ws.org/Vol-" + str(i) + "/"
                      for i in range(int(vols[0]), int(vols[1]) + 1)]
    elif re.match(r'^(\d+\s)+\d(\s)?', vol_numbers):
        # A space-separated list of volumes, e.g. "1513 1540 1560"
        numbers = vol_numbers.split()
        input_urls = ["http://ceur-ws.org/Vol-" + str(i) + "/" for i in numbers]
    else:
        raise ValueError('Your input is not valid.')
    bot.initial_urls = input_urls
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    print(bot.render_stats())
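# A minimal, hypothetical sketch (not part of the original project): the volume-input
# parsing above could be factored into a standalone helper so it can be unit-tested
# apart from the spider. The parse_volume_urls name and BASE_URL constant are
# illustrative assumptions.
import re

BASE_URL = 'http://ceur-ws.org/Vol-%s/'

def parse_volume_urls(vol_numbers):
    """Turn input like '1513', '1513-1550' or '1513 1540 1560' into proceedings URLs."""
    if re.match(r'^\d+$', vol_numbers):
        return [BASE_URL % vol_numbers]
    if re.match(r'^\d+-\d+$', vol_numbers):
        start, end = vol_numbers.split('-')
        return [BASE_URL % i for i in range(int(start), int(end) + 1)]
    if re.match(r'^\d+(\s+\d+)*\s*$', vol_numbers):
        return [BASE_URL % n for n in vol_numbers.split()]
    raise ValueError('Your input is not valid.')

# Example: parse_volume_urls('1513-1515') ->
#   ['http://ceur-ws.org/Vol-1513/', 'http://ceur-ws.org/Vol-1514/', 'http://ceur-ws.org/Vol-1515/']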
def main():
    default_logging()
    bot = CEURSpider()
    bot.initial_urls = config.input_urls
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    print(bot.render_stats())
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    default_logging(propagate_network_logger=kwargs['propagate_network_logger'])

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        open('var/stats-%d.txt' % pid, 'wb').write(stats)
def start_parsing():
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)
    bot = GlobalsourcesCrawler(thread_number=config.THREAD_NUMBER)
    bot.setup_cache('mysql', database=config.MYSQL_DATABASE, use_compression=True,
                    user=config.MYSQL_USER, passwd=config.MYSQL_PASSWORD)
    bot.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    if config.DEBUG:
        bot.save_list('fatal', config.FATAL_ERROR_DUMP)
    comp_db.session.commit()
    print bot.render_stats()
    sys.exit()
def main():
    default_logging(grab_log="log.txt")
    fl = open("out.txt", "w")
    flval = open("outval.txt", "w")
    bot = CEURSpider()
    bot.initial_urls = config.input_urls
    bot.out = fl
    bot.validate = flval
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    fl.close()
    flval.close()
    bot.print_stats()
    print(bot.render_stats())
def start_parsing():
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)
    bot = RosfirmCrawler(thread_number=config.THREAD_NUMBER)
    # bot = ProffNoCrawler(thread_number=config.THREAD_NUMBER)
    # bot.setup_queue('mysql', database='proff_no', use_compression=True, user='******', passwd='proff_no_u7Hy4')
    bot.setup_cache('mysql', database=config.MYSQL_DB, use_compression=True,
                    user=config.MYSQL_USER, passwd=config.MYSQL_PASS)
    if config.DEBUG:
        bot.setup_grab(log_dir=config.LOG_DIR)
    bot.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    try:
        bot.run()
        print bot.render_stats()  # print statistics when render_stats_on = 1
        bot.save_list('fatal', config.FATAL_ERROR_DUMP)
    except KeyboardInterrupt:
        if config.DEBUG:
            bot.save_list('fatal', config.FATAL_ERROR_DUMP)
        print bot.render_stats()
    sys.exit()
# -*- coding: utf-8 -*-
from grab.spider import Spider, Task
from grab import Grab
import logging
from grab.tools.logs import default_logging

default_logging(level=logging.ERROR)

THREADS = 1
URLS_FILE = 'urls.txt'
FOUND_FILE = 'found.txt'
NOT_FOUND_FILE = 'not_found.txt'


class CookieSpider(Spider):
    errors = []

    def task_generator(self):
        self.errors = prepare_errors()
        with open(URLS_FILE) as f:
            for url in f:
                if url.strip():
                    grab = Grab()
                    grab.setup(url=url)
                    print "Start checking the - ", url
                    yield Task('initial', url=url, grab=grab)

    def task_initial(self, grab, task):
def main():
    default_logging()
    bot = SpeedSpider(thread_number=30)
    bot.setup_cache(database='speed_spider', use_compression=True)
    bot.run()
    print(bot.render_stats())
        name=name,
        description=info,
        url=lookbook_url,
        num_followers=textutils.first_int_word(fans),
        blog_url=blog_url if len(blog_url) > 0 else None,
        site_url=website_url if len(website_url) > 0 else None)
    if created:
        print "Created a new blog_obj"
    else:
        print "Object already existed"


if __name__ == '__main__':
    # change the current dir
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # set up logging for the scraper (an empty error.log and an empty network.log is a good sign)
    # set level=10 to log all events
    default_logging(grab_log='/tmp/errors.log', level=20, mode='w',
                    propagate_network_logger=False, network_log='/tmp/network.log')
    # prepare for the battle
    bot = LookbookScraper(thread_number=THREAD_NUMBER, network_try_limit=3)
    try:
        # good luck and have fun!
        bot.run()
    finally:
        # show stats
        print bot.render_stats()
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        open('var/stats-%d.txt' % pid, 'wb').write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
# -*- coding: utf-8 -*-
"""
Pravda news articles spy
"""
from grab.tools.logs import default_logging
from spiders.pravda_archive import PravdaArchiveSpider
from config import default_spider_params


if __name__ == '__main__':
    default_logging()
    print "Scrape Pravda archive articles"
    bot = PravdaArchiveSpider(**default_spider_params())
    bot.setup_grab(timeout=4096, connect_timeout=10)
    bot.run()
    print bot.render_stats()
# -*- coding: utf-8 -*-
"""
Github projects spy
"""
from optparse import OptionParser

from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging

from spiders.explore import ExploreSpider
from spiders.lang_python import LangPythonSpider
from config import default_spider_params, Session


if __name__ == '__main__':
    default_logging()
    parser = OptionParser()
    # command line options
    parser.add_option("-p", "--python", action="store_true",
                      dest="parse_python", default=False)
    options, args = parser.parse_args()

    if options.parse_python:
        print "Scrape python projects"
        bot = LangPythonSpider(**default_spider_params())
    else:
        print "Scrape trending projects"
        bot = ExploreSpider(**default_spider_params())
def __init__(self):
    default_logging()
    logDir = '/tmp/fanduel'
    if not os.path.exists(logDir):
        os.makedirs(logDir)
    self.grab = Grab(log_dir=logDir, debug_post=True)
from random import choice
from datetime import datetime

from dateutil.parser import parse as parse_iso_date
from urlparse import urlparse

from grab.spider import Spider, Task
from grab import Grab
from grab.tools.logs import default_logging

from fetcherbase import Fetcher
from debra import models
from platformdatafetcher.activity_levels import recalculate_activity_level


default_logging(grab_log='/tmp/grab.log', level=10, mode='w',
                propagate_network_logger=False,
                network_log='/tmp/grab_network.log')


class BlogspotFetcher(Spider, Fetcher):
    name = 'Blogspot'

    # The names of months (for parsing of the date)
    months_names = {
        'JANUARY': 1, 'JAN': 1,
        'FEBRUARY': 2, 'FEB': 2,
        'MARCH': 3,
import csv
import logging
import os

from grab.spider import Spider, Task
from grab.tools import html
from grab.tools.logs import default_logging
from hashlib import sha1
from grab import Grab

g = Grab()
default_logging(level=logging.DEBUG)
path = os.path.dirname(os.path.abspath(__file__))

MAIN_LINK = 'http://www.immobilienscout24.de/Suche/S-T/P-{}/Wohnung-Miete/Berlin/Berlin'
THREADS = 2


class Immospider(Spider):
    def __init__(self):
        super(Immospider, self).__init__(thread_number=THREADS, network_try_limit=20)
        self.result_file = csv.writer(open('result.csv', 'w'))
        self.result_file.writerow(['Title', 'Address', 'Wohnungstyp', 'Etage',
                                   'Wohnflaeche', 'Bezugsfrei_ab', 'Zimmer',
                                   'Haustiere', 'Kaltmiete', 'Nebenkosten',
                                   'Heizkosten', 'Gesamtmiete',
                                   'Kaution_o_genossenschaftsanteile', 'URL'])
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected', '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected', '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url', '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )

    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
from models import Data, session, table_name
import json
import logging
import os

import feedparser
from datetime import datetime
from grab.spider import Spider, Task
from grab.tools import html
from grab.tools.logs import default_logging
from hashlib import sha1

default_logging(level=logging.DEBUG)
path = os.path.dirname(os.path.abspath(__file__))

URLS_FILE = os.path.join(path, 'urls.txt')
RSS_LINK = 'http://pathmark.inserts2online.com/rss.jsp?drpStoreID={0}'
IMAGE_DIR = os.path.join(path, 'images/')
THREADS = 2


class RSSspider(Spider):
    def __init__(self):
        super(RSSspider, self).__init__(thread_number=THREADS, network_try_limit=20)
        row = {}
        for childelems in elem.iterchildren():
            if 'b-serp-item__price' == childelems.attrib['class']:
                row['price'] = find_node_number(childelems, ignore_spaces=True)
            if 'b-serp-item__header' == childelems.attrib['class']:
                row['header'] = get_node_text(childelems)
                ahref = childelems.iterchildren()
                row['link'] = list(ahref)[0].get('href')
            if 'b-serp-item__about' == childelems.attrib['class']:
                row['about'] = get_node_text(childelems)
            if 'b-serp-item__address' == childelems.attrib['class']:
                adresselems = childelems.iterchildren()
                adress_and_subway = list(adresselems)[1]
                adress = adress_and_subway.text
                adress_and_subway_iter = adress_and_subway.iterchildren()
                subway = list(adress_and_subway_iter)[0].text
                row['adress'] = adress
                row['subway'] = subway
            if 'b-serp-item__owner' == childelems.attrib['class']:
                row['owner'] = get_node_text(childelems)
        row['time'] = int(time.time())
        self.csvfilesaver.save(listrow(row))
        grab.url.split('=page')


if __name__ == '__main__':
    default_logging(grab_log='/tmp/grab.log', level=logging.DEBUG, mode='a',
                    propagate_network_logger=False,
                    network_log='/tmp/grab.network.log')
    # TODO: Put initial URL and filename into constructor.
    # TODO: Add SQL saver.
    bot = YarSpider()
    bot.run()