def create_crawler(spider):
    """Set up the item signal and run the spider."""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    return crawler
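# The snippet above targets the pre-1.0 Scrapy API: scrapy.conf.settings,
# scrapy.xlib.pydispatch, CrawlerProcess.install()/configure() and the
# item_passed signal were all removed in later releases. A minimal sketch of
# the same idea against the current API (Scrapy >= 1.0), not a drop-in
# replacement for the code above:
from scrapy import signals
from scrapy.crawler import CrawlerProcess


def create_crawler_modern(spider_cls):
    """Connect an item callback and schedule the spider on a new process."""
    process = CrawlerProcess({'LOG_ENABLED': False})
    crawler = process.create_crawler(spider_cls)

    def catch_item(item, response, spider):
        print("Got:", item)

    # item_scraped is the modern counterpart of the old item_passed signal
    crawler.signals.connect(catch_item, signal=signals.item_scraped)
    process.crawl(crawler)
    return process  # the caller runs process.start() to block until done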
def service_sis(self):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(worker.Worker)
    process.start()  # the script will block here until the crawling is finished
def main(argv):
    usage = ('Usage:\n'
             'python2.7 decc.py -h(help)\n'
             'python2.7 decc.py -c(crawl articles)\n'
             'python2.7 decc.py -s(search article by section) <section>\n'
             'python2.7 decc.py -t(search article by title) <title>')
    try:
        # -h and -c take no argument; -t and -s each take one
        opts, args = getopt.getopt(argv, "hct:s:", ['title=', 'section='])
    except getopt.GetoptError:
        print usage
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print usage
            sys.exit()
        elif opt == '-c':
            # start crawling articles here
            print "crawling"
            process = CrawlerProcess(get_project_settings())
            process.crawl(BBCArticleSpider)
            process.start()
        elif opt in ('-t', '--title'):
            # start searching articles by title
            print "search by title"
            results = BBCArticleItem.fetch_by_title(arg)
            for result in results:
                print result
        elif opt in ('-s', '--section'):
            # start searching articles by section
            print "search by section"
            results = BBCArticleItem.fetch_by_section(arg)
            for result in results:
                print result
def crawl(spiders_classes, connector, debug=False,
          spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for JobSpider classes.

    :param scrapy_settings: dict of settings merged with the CrawlerProcess default settings
    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)
    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()
    return spiders
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped from scrapy import signals from scrapy.xlib.pydispatch import dispatcher def catch_item(sender, item, **kwargs): print "Item Extraido:", item dispatcher.connect(catch_item, signal=signals.item_passed) from scrapy.conf import settings settings.overrides['LOG_ENABLED'] = False # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) crawler.install() crawler.configure() # definir el spider para el crawler crawler.crawl(BloggerSpider()) # iniciar scrapy print "STARTING ENGINE" crawler.start() print "ENGINE STOPPED"
class CrawlerScript():
    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self, item, response, spider):
        self.items.append(item)

    def _crawl(self, q, queue):
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        # run each crawl in its own child process: Twisted's reactor
        # cannot be restarted within the same process
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        p.join()
        return queue.get(True)
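# Hypothetical usage of the CrawlerScript helper above: because every crawl
# runs in its own child process, the parent never tries to restart Twisted's
# one-shot reactor, so repeated queries work.
if __name__ == '__main__':
    scraper = CrawlerScript()
    first_results = scraper.crawl('scrapy tutorial')   # first query
    second_results = scraper.crawl('python')           # second query also works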
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
        },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
    })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)
    crawler_process.start()

    return output
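# A signal-based alternative to the RefDict deepcopy hack above, sketched
# under the assumption that the pipeline can be dropped and items collected
# directly; collect_items is a hypothetical helper, not part of the original:
from scrapy import signals
from scrapy.crawler import CrawlerProcess


def collect_items(spider_classes, settings):
    items = []
    process = CrawlerProcess(settings)
    for spider_cls in spider_classes:
        crawler = process.create_crawler(spider_cls)
        # item_scraped fires once per item, letting us keep a plain list
        crawler.signals.connect(
            lambda item, response, spider: items.append(item),
            signal=signals.item_scraped)
        process.crawl(crawler)
    process.start()
    return items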
def run_spider(spider, settings):
    """Run a spider with the given settings."""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        # log.msg("Got: " + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # clean storage
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess
    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    # log.start(loglevel='DEBUG')
    crawler.start()
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
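# make_spider is not shown above; a hypothetical sketch of such a factory,
# building one spider class per (artist, url) pair with type():
import scrapy


def make_spider(artist, url, city):
    return type(
        '{}Spider'.format(artist.title().replace(' ', '')),
        (scrapy.Spider,),
        {
            'name': '{}_{}'.format(artist, city).lower(),
            'start_urls': [url],
            # parse left as a stub; the real callback would extract items
            'parse': lambda self, response: None,
        },
    )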
def get(self):
    while True:
        process = CrawlerProcess(get_project_settings())
        process.crawl('iqiyi')
        process.start()
        time.sleep(3000)
    self.finish()
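# Caveat for the loop above: CrawlerProcess.start() runs the Twisted reactor,
# which cannot be restarted in the same process, so the second iteration
# raises ReactorNotRestartable. A minimal sketch of a common workaround,
# assuming the same 'iqiyi' project spider: give every run a fresh child
# process.
import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def _run_once():
    process = CrawlerProcess(get_project_settings())
    process.crawl('iqiyi')
    process.start()  # blocks until this crawl finishes


def crawl_forever(interval=3000):
    while True:
        p = Process(target=_run_once)
        p.start()
        p.join()  # each iteration gets a brand-new reactor
        time.sleep(interval)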
def handle(self, *args, **options):
    # It would be better to pass this in as a parameter to PayoutSpider
    global start_date
    start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

    delete = options.get('delete')
    delete_all = options.get('delete_all')
    retrieve_all = options.get('retrieve_all')

    previous_payout = None
    previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
    if delete_all or (delete and previous_payouts.count() == 0):
        codementor_models.Review.objects.all().delete()
        codementor_models.Session.objects.all().delete()
        codementor_models.Payout.objects.all().delete()
        codementor_models.Payment.objects.all().delete()
    elif delete:
        previous_payout = previous_payouts[0]
        codementor_models.Review.objects.filter(date__gt=start_date).delete()
        codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
        previous_payout.delete()
        codementor_models.Payment.objects.filter(payout__isnull=True).delete()

    if not retrieve_all and previous_payout:
        start_date = previous_payout.date

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(PayoutSpider)
    process.start()
def spiderCrawl(bandname):
    createLink(bandname)
    settings = get_project_settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()
def run(self):
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('stackoverflow')
    process.start()
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
class MySpiderProcess1(scrapy.Spider):
    def __init__(self, name, urls):
        self.name = name
        self.start_urls = urls
        scrapy.Spider.__init__(self)

    def parse(self, response):
        print('parse response')

    def _crawl(self):
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        self.process.crawl(self, self.name, self.start_urls)
        self.process.start()

    def start(self):
        # run the crawl in a child process so that starting it does not
        # block (or clash with) a reactor in the parent process
        p = Process(target=self._crawl)
        p.start()
        p.join()

    def stop(self):
        self.process.stop()
def main(): """Setups item signal and run the spider""" # set up signal to catch items scraped from scrapy import signals from scrapy.xlib.pydispatch import dispatcher def catch_item(sender, item, **kwargs): print "Got:", item # shut off log from scrapy.conf import settings settings.overrides['LOG_ENABLED'] = False # set up crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) crawler.install() crawler.configure() # schedule spider crawler.crawl(MySpider()) # start engine scrapy/twisted print "STARTING ENGINE" crawler.start() print "ENGINE STOPPED"
def magic():
    process = CrawlerProcess(get_project_settings())
    # 'magic' is the name of one of the spiders of the project.
    process.crawl('magic')
    process.start()  # the script will block here until the crawling is finished
def main(tabLink):
    # str.find() returns -1 (truthy) when not found, so membership tests
    # are used here instead
    if "ultimate-guitar.com" in tabLink:
        tabSpider = Spiders.Ultimate(tabLink)
    elif "guitartabs.cc" in tabLink:
        tabSpider = Spiders.TabCC(tabLink)
    else:
        print("Domain name not supported.")
        return

    # Make a process to instantiate a spider with the given
    # arguments and make it crawl the link
    process = CrawlerProcess(get_project_settings())
    process.crawl(tabSpider, link=tabLink)
    process.start()

    # Link has been scraped, now process it
    tree = xmltree.parse(tabs.pipelines.filename)
    root = tree.getroot()
    value = root[0][0][0]
    rawTab = value.text
    if "\M" in rawTab:
        rawTab = parsefuncs.removeLineEndings(rawTab)
    cleanTab = parsefuncs.parseTab(rawTab)
    print("Clean tab is:")
    count = 0
    for line in cleanTab:
        count += 1
        print(line)
        if count % 6 == 0:
            print(" ")
def main(): """Index alexa demographics """ engine = db_connect() Session = sessionmaker(bind=engine) session = Session() settings = get_project_settings() settings.set('ITEM_PIPELINES', {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300}) settings.set('EXTENSIONS', {'scrapy.telnet.TelnetConsole': None,}) process = CrawlerProcess(settings) for website in session.query(WebsitesContent).all(): demographic = list(session.query(Websites).filter_by(link=website.link)) if len(demographic) is 0: url = website.link print website.link AlexaSpider.name = url process.crawl(AlexaSpider, url=url, db_session=session) process.start() process.stop() session.close()
def _crawl(path=None):
    crawl = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
def handle(self, *args, **options):
    setting = {
        'USER_AGENT': options['user_agent'],
        'DOWNLOAD_DELAY': options['download_delay'],
        'LOG_FILE': settings.SCRAPY_LOG_FILE,
        'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
    }

    if options['proxy_list']:
        try:
            f = open(options['proxy_list'])
        except IOError:
            raise CommandError('cannot open proxy list file for read')

        # Retry many times since proxies often fail
        setting['RETRY_TIMES'] = 10
        # Retry on most error codes since proxies fail for different reasons
        setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        setting['DOWNLOADER_MIDDLEWARES'] = {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            'spider.randomproxy.RandomProxy': 100,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
        }
        setting['PROXY_LIST'] = options['proxy_list']

    process = CrawlerProcess(setting)
    process.crawl(BaiduSpider)
    process.start()
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

    1. Spider(s) given as argument(s)
    2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Fill the database."""
        for x, _ in enumerate(item.items()):
            query = ("INSERT INTO book (Nombre, Autor, Editorial, Fecha, Precio, Link) VALUES ("
                     + decodifica(item['Nombre'][x]) + ","
                     + decodifica(item['Autor'][x]) + ","
                     + decodifica(item['Editorial'][x]) + ","
                     + decodifica(item['Fecha'][x]) + ","
                     + decodifica(item['Precio'][x]) + ","
                     + decodifica("http://www.casadellibro.com" + item['Link'][x]) + ");")
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    book = BookSpider()
    book.busqueda = unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping La Casa del Libro"
    crawler.start()
    print "End scraping La Casa del Libro"
    crawler.stop()
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ")  # get user input
    sitedomain = parse_base_url(sitedomain)  # clean url

    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()

    if sitetext != '':  # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None},
        'LOG_LEVEL': 'INFO'
    })
    process.crawl(SoloSpider, domain=sitedomain)
    process.start()

    # presumably finished here - pull newly loaded sitetext for domain
    cur.execute(sql, sitedomain)
    return cur.fetch()
def Test_Scapy(self):
    spider = FtpSpider()
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(spider)
    process.start()
def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
    busy.value = 1
    if os.path.exists("data.json"):
        os.remove("data.json")

    print("Started crawling task")
    process = CrawlerProcess(get_project_settings())
    process.crawl("od_links", base_url=website.url)
    process.start()
    print("Done crawling")

    self.db.import_json("data.json", website)
    os.remove("data.json")
    print("Imported in SQLite3")

    if post_id:
        # Reply to post
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"": stats}, website.id)
        print(comment)
        if "total_size" in stats and stats["total_size"] > 10000000:
            post = self.reddit_bot.reddit.submission(post_id)
            self.reddit_bot.reply(post, comment)
        else:
            self.reddit_bot.log_crawl(post_id)
    elif comment_id:
        # Reply to comment
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
        print(comment)
        reddit_comment = self.reddit_bot.reddit.comment(comment_id)
        self.reddit_bot.reply(reddit_comment, comment)

    busy.value = 0
    print("Done crawling task")
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Helper method that starts a spider with the given init arguments, waits
    for it to complete, and returns the items it yielded in a list.

    :param spider_cls: the spider class to run
    :param args: the positional arguments to the spider
    :param kwargs: the keyword arguments to the spider
    :return: a list of items yielded by the spider
    """
    process = CrawlerProcess()
    process.crawl(spider_cls, *args, **kwargs)

    final_result = []

    def _nab_item(item):
        # FIXME: this silly dance of encoding and decoding is to prevent
        # scrapy items from being returned to celery: celery can't serialize
        # them, so it throws a rather opaque error, but it's fine with lists
        # and dicts
        final_result.append(json.loads(scrapy_encoder.encode(item)))

    for crawler in process.crawlers:
        crawler.signals.connect(_nab_item, item_scraped)

    process.start()
    process.stop()

    return final_result
def __init__(self, titlesfile=None, platform=None, region=None):
    # set default encoding to utf8 for parsing and logging
    # utf-8 characters in console and files
    reload(sys)
    sys.setdefaultencoding('utf8')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='export.log',
        filemode='a',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    # identify platform
    self.platform = platform
    if self.platform is None:
        logging.error('No platform found! Pass it as an argument.')
        return
    else:
        platformId = platforms.getId(self.platform)
        if platformId is None:
            logging.error('Platform ' + self.platform + ' not supported.')
            return

    self.titlesfile = titlesfile
    self.region = region
    if self.region is None:
        self.region = "Worldwide"

    if titlesfile:
        titles = []
        urls = []
        with open(self.titlesfile) as f:
            titles = f.read().splitlines()
        for title in titles:
            logging.debug('Submitting title: ' + title)
            urls.append('http://mobygames.com/search/quick'
                        '?q=' + title +
                        '&p=' + platformId +
                        '&search=Go'
                        '&sFilter=1'
                        '&sG=on'
                        '&search_title=' + urllib.quote(title) +
                        '&search_platform=' + urllib.quote(self.platform) +
                        '&search_region=' + urllib.quote(self.region))
        process = CrawlerProcess(get_project_settings())
        process.crawl(MobygamesSpider, start_urls=urls)
        process.start()
    else:
        logging.warning('No file.')
def news_flash_crawl(rss_link, site_name, maps_key):
    id_flash = get_latest_id_from_db() + 1
    latest_date = get_latest_date_from_db()
    d = feedparser.parse(rss_link)
    process = CrawlerProcess()
    for entry in d.entries[::-1]:
        entry_parsed_date = datetime.strptime(entry.published[:-6],
                                              '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        if latest_date is None or entry_parsed_date > latest_date:
            news_item = {'id_flash': id_flash,
                         'date_parsed': entry_parsed_date,
                         'title': entry.title,
                         'link': entry.links[0].href,
                         'date': entry.published,
                         'location': '',
                         'lat': 0,
                         'lon': 0}
            # Hebrew keyword heuristics: flag the flash as an accident when
            # the title mentions an accident (but not a workplace accident),
            # or when a vehicle/pedestrian word co-occurs with an
            # injury/collision word.
            if (u'תאונ' in entry.title
                    and u'תאונת עבודה' not in entry.title
                    and u'תאונות עבודה' not in entry.title) \
                or ((u'רכב' in entry.title or u'אוטובוס' in entry.title
                     or u"ג'יפ" in entry.title or u'משאית' in entry.title
                     or u'קטנוע' in entry.title or u'אופנוע' in entry.title
                     or u'אופניים' in entry.title or u'קורקינט' in entry.title
                     or u'הולך רגל' in entry.title or u'הולכת רגל' in entry.title
                     or u'הולכי רגל' in entry.title)
                    and (u'נפגע' in entry.title or u'פגיע' in entry.title
                         or u'נפצע' in entry.title or u'פציע' in entry.title
                         or u'התנגש' in entry.title or u'התהפך' in entry.title
                         or u'התהפכ' in entry.title)):
                news_item['accident'] = True
            else:
                news_item['accident'] = False
            if site_name == 'ynet':
                news_item['source'] = 'ynet'
                process.crawl(YnetFlashScrap, entry.links[0].href,
                              news_item=news_item, maps_key=maps_key)
            id_flash += 1
    process.start()
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch a crawl job for JobSpider classes.

    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()
    return spiders
from wikipedia.spiders import WikipediaSpider
from scrapy.crawler import CrawlerProcess
import networkx as nx
import matplotlib.pyplot as plt
import urllib.parse

if __name__ == "__main__":
    crawl_depth = 2
    process = CrawlerProcess({
        'LOG_LEVEL': 'ERROR',
        'DEPTH_LIMIT': crawl_depth
    })
    process.crawl(WikipediaSpider)
    spider = next(iter(process.crawlers)).spider
    spider.max_items_per_page = 5
    spider.max_crawl_depth = crawl_depth
    process.start()

    for pm in spider.linked_pages:
        print(pm.depth, pm.link, pm.child_link)
    print("-" * 80)

    g = nx.Graph()
    nodes = {}
    edges = {}
    for pm in spider.linked_pages:
        if pm.title not in nodes:
            nodes[pm.title] = pm
            g.add_node(pm.title)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from news.spiders.adevaru_spider import AdevarulSpider
from news.spiders.hotnews_spider import HotnewsSpider
from news.spiders.agerpress_spider import AgerpressSpider
from news.spiders.digi_spider import DigiSpider
from news.spiders.tvr_spider import TVRSpider
from news.spiders.protv_spider import ProTVSpider
from news.spiders.realitatea_spider import RealitateaSpider

process = CrawlerProcess(get_project_settings())
process.crawl(AdevarulSpider)
process.crawl(RealitateaSpider)
process.crawl(ProTVSpider)
process.crawl(HotnewsSpider)
process.crawl(AgerpressSpider)
process.crawl(DigiSpider)
process.crawl(TVRSpider)
process.start()
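# The snippet above schedules all seven spiders before the single start()
# call, so they run concurrently inside one reactor. If the crawls must run
# strictly one after another, Scrapy's documented pattern chains deferreds on
# a CrawlerRunner; a sketch with two of the spiders above:
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl_sequentially():
    yield runner.crawl(AdevarulSpider)
    yield runner.crawl(HotnewsSpider)
    reactor.stop()


crawl_sequentially()
reactor.run()  # blocks until the last crawl finishes and the reactor stops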
import asyncio

from twisted.internet import asyncioreactor
asyncioreactor.install(asyncio.get_event_loop())

import scrapy
from scrapy.crawler import CrawlerProcess


class NoRequestsSpider(scrapy.Spider):
    name = 'no_request'

    def start_requests(self):
        return []


process = CrawlerProcess(settings={
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
})
process.crawl(NoRequestsSpider)
process.start()
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'ITEM_PIPELINES': {
        'pipelines.SaveUserReviewPipeline': 300
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    },
    'POSTGRES_HOST': 'localhost',
    'POSTGRES_PORT': '25432',
    'POSTGRES_DB': 'mob',
    'POSTGRES_USER': '******',
    'POSTGRES_PASSWORD': '******'
})

if len(sys.argv) == 1:
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval',
                      args=[UserReviewSpider, lambda: start_objs()],
                      seconds=30)
    scheduler.start()
    process.start(False)  # stop_after_crawl=False keeps the reactor running between scheduled crawls
else:
    process.crawl(UserReviewSpider, lambda: start_objs())
    process.start()
from tutorial.spiders.SpiderPost import SpiderPost
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': 'bigyasuo/qq-1801041646'})

csharp = "https://search.51job.com/list/040000,000000,0000,00,9,99,c%2523,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
dotnet = "https://search.51job.com/list/040000,000000,0000,00,9,99,.net,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
shanghai_csharp = "https://search.51job.com/list/020000,000000,0000,00,9,99,c%2523,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
shanghai_dotnet = "https://search.51job.com/list/020000,000000,0000,00,9,99,.net,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

# urls = [csharp, dotnet]
urls = [shanghai_csharp, shanghai_dotnet]
keys = ['c#', '.net']
process.crawl(SpiderPost, urls, 10000, keys)
process.start()  # the script will block here until the crawling is finished
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)

    # Specify what type of crawl we want to do
    crawl_type = parser.add_mutually_exclusive_group(required=True)
    crawl_type.add_argument("--site", "-s", help="Name of site to crawl.")
    crawl_type.add_argument("--all", "-a", action="store_true",
                            help="Crawl all sites.")
    crawl_type.add_argument(
        "--list", "-l",
        help="CSV file of URLs to crawl with an 'article_url' column and a "
             "'site_name' column.")

    # General options
    parser.add_argument(
        "--max_articles", "-n", type=int, default=0,
        help="Maximum number of articles to process from each site.")
    parser.add_argument("--exporter", "-e", default="file",
                        choices=["file", "blob"],
                        help="Article export method.")
    parser.add_argument("--no-digest", action="store_true",
                        help="Disable content digests.")
    parser.add_argument("--no-index", action="store_true",
                        help="Disable node indexes.")
    args = parser.parse_args()

    # Set up logging
    configure_logging()
    logging.getLogger("azure.storage.common.storageclient").setLevel(logging.ERROR)
    logging.getLogger("sqlalchemy").setLevel(logging.ERROR)

    # Load crawler settings and apply local overrides
    settings = get_project_settings()
    settings.update({
        'ARTICLE_EXPORTER': args.exporter,
        'CONTENT_DIGESTS': (not args.no_digest),
        'NODE_INDEXES': (not args.no_index),
    })

    # Apply an item limit if specified
    if args.max_articles:
        settings.update({'CLOSESPIDER_ITEMCOUNT': args.max_articles})

    # Set up a crawler process
    process = CrawlerProcess(settings)

    # Load crawler configurations for all sites
    site_configs = yaml.load(
        pkg_resources.resource_string(__name__, "site_configs.yml"),
        Loader=yaml.FullLoader)
    article_override_lists = yaml.load(
        pkg_resources.resource_string(__name__, "article_override_lists.yml"),
        Loader=yaml.FullLoader)
    for site_name in article_override_lists:
        site_configs[site_name]["article_override_list"] = \
            article_override_lists[site_name]

    # Crawl a single site
    # -------------------
    if args.site:
        # Create a dynamic spider class and register it with the crawler
        spider_class = dynamic_spider_class(site_configs[args.site], args.max_articles)
        process.crawl(spider_class, config=site_configs[args.site])

    # Crawl all sites
    # ---------------
    elif args.all:
        for site_name in site_configs:
            # Create a dynamic spider class and register it with the crawler
            spider_class = dynamic_spider_class(site_configs[site_name], args.max_articles)
            process.crawl(spider_class, config=site_configs[site_name])

    # Crawl all URLs from a CSV file
    # ------------------------------
    elif args.list:
        # Load articles from CSV into a dictionary
        article_urls = defaultdict(list)
        with open(args.list, "r") as csvfile:
            dialect = csv.Sniffer().sniff(csvfile.read(50))
            csvfile.seek(0)
            reader = csv.DictReader(csvfile, dialect=dialect)
            if not all(f in reader.fieldnames for f in ["article_url", "site_name"]):
                raise ValueError(
                    "CSV input must have an 'article_url' column and a "
                    "'site_name' column")
            for row in reader:
                article_urls[row["site_name"]].append(row["article_url"])

        # Iterate over each site
        for site_name in sorted(article_urls.keys()):
            # Override the configuration for the specified site
            site_config = site_configs[site_name]
            site_config["start_url"] = ""
            site_config["article_override_list"] = article_urls[site_name]
            # Create a dynamic spider class and register it with the crawler
            spider_class = dynamic_spider_class(site_config, args.max_articles)
            process.crawl(spider_class, config=site_config)

    # Start the crawler
    process.start()
    columns['question'].append(question_text)
    columns['answer'].append(answer_text)
    columns['answer_html'].append(answer_html)

    today = date.today()

    columns["link"] = ["https://www.who.int/news-room/q-a-detail/q-a-coronaviruses"] * len(columns["question"])
    columns["name"] = ["Q&A on coronaviruses (COVID-19)"] * len(columns["question"])
    columns["source"] = ["Robert Koch Institute (RKI)"] * len(columns["question"])
    columns["category"] = [""] * len(columns["question"])
    columns["country"] = ["DE"] * len(columns["question"])
    columns["region"] = [""] * len(columns["question"])
    columns["city"] = [""] * len(columns["question"])
    columns["lang"] = ["de"] * len(columns["question"])
    columns["last_update"] = [today.strftime("%Y/%m/%d")] * len(columns["question"])

    return columns


if __name__ == "__main__":
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(CovidScraper)
    process.start()
def handle(self, *args, **options):
    process = CrawlerProcess(get_project_settings())
    # process.crawl(CsSpider)
    process.crawl(ComeduSpider)
    process.start()
def handle(self, *args, **options):
    if options.get("drop_all"):
        self.warn("Deleting records...")
        CityCouncilAgenda.objects.all().delete()
        CityCouncilAttendanceList.objects.all().delete()
        CityCouncilMinute.objects.all().delete()
        CityHallBid.objects.all().delete()
        Gazette.objects.all().delete()
        GazetteEvent.objects.all().delete()
        File.objects.all().delete()

    dispatcher.connect(self.save, signal=signals.item_passed)
    os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
    settings = get_project_settings()

    if options.get("scrapy_args"):
        scrapy_args = json.loads(options.get("scrapy_args"))
        settings.update(scrapy_args)

    process = CrawlerProcess(settings=settings)
    process.crawl(
        AgendaSpider,
        start_from_date=CityCouncilAgenda.last_collected_item_date(),
    )
    process.crawl(
        AttendanceListSpider,
        start_from_date=CityCouncilAttendanceList.last_collected_item_date(),
    )
    process.crawl(
        MinuteSpider, start_from_date=CityCouncilMinute.last_collected_item_date()
    )
    process.crawl(
        BidsSpider, start_from_date=CityHallBid.last_collected_item_date()
    )

    last_collected_gazette = Gazette.last_collected_item_date()
    if last_collected_gazette is None:
        process.crawl(LegacyGazetteSpider)
    process.crawl(
        ExecutiveAndLegislativeGazetteSpider,
        start_from_date=last_collected_gazette,
    )

    self.warn("Starting the crawl...")
    process.start()
    self.success("Done!")
        else:
            request = 'http://maps.google.com/maps/api/geocode/json?address={},+{},+{}'.format(street, district, city)

        if idx > 0:
            sleep(random.randint(2, 10))  # prevent getting blocked by the Google API

        r = requests.get(request)
        results = r.json()["results"]
        result_types = list(map(lambda x: ",".join(x["types"]), results))
        exact_matches_idx = [index for index, value in enumerate(result_types)
                             if "street_address" in value
                             or "establishment" in value
                             or "premise" in value]
        if len(exact_matches_idx) != 1:
            coords.append("ambiguous address")
        else:
            idx = exact_matches_idx[0]
            location = results[idx]["geometry"]["location"]
            coords.append([location["lat"], location["lng"]])
    return coords


os.environ["SCRAPY_SETTINGS_MODULE"] = "is24crawler.settings"

print("------SETTINGS------ csv path: {}".format(settings.CSV_FILE_PATH))
print("------SETTINGS------ start page: {}".format(settings.PAGE_START))
print("------SETTINGS------ end page: {}".format(settings.PAGE_END))

process = CrawlerProcess(get_project_settings())
process.crawl(Immoscout24Bot)
process.start()
import scrapy
from scrapy.crawler import CrawlerProcess

from CarPriceSpider.CarPriceSpider.spiders.xcar_area import XcarAreaSpider

process = CrawlerProcess()
process.crawl(XcarAreaSpider)
# process.crawl(MySpider2)
process.start()  # the script will block here until all crawling jobs are finished
parser = reqparse.RequestParser()
parser.add_argument('query', required=True,
                    help='A search term needs to be provided')
parser.add_argument('brand', required=True,
                    help='A brand needs to be provided')
args = parser.parse_args()

product = parse.urlencode({'query': args.query})
brand = (parse.urlencode({'brand': args.brand})).split("=")[1]
find = product + '+' + brand
print(find)

s = get_project_settings()
process = CrawlerProcess(s)
process.crawl('ebay', find)
process.start()
print('Crawling Completed')


api.add_resource(SteamSearch, '/query')

if __name__ == '__main__':
    # app.run(host='0.0.0.0', port=5000, debug=True)
    app_server = WSGIServer(('0.0.0.0', 5000), app)
    app_server.serve_forever()
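# Caveat for the handler above: inside a long-lived web server, a
# CrawlerProcess can only be started once per process, because the Twisted
# reactor cannot be restarted after it stops. A common workaround, sketched
# here with the crochet library (the 120-second timeout is an assumption, and
# run_ebay_crawl is a hypothetical helper, not part of the original code):
from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

setup()  # start the shared reactor once, in a background thread
runner = CrawlerRunner(get_project_settings())


@wait_for(timeout=120.0)
def run_ebay_crawl(find):
    # runner.crawl returns a Deferred; wait_for blocks the calling
    # (request-handling) thread until the crawl finishes
    return runner.crawl('ebay', find)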
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from media_parse import settings
from media_parse.spiders.VK_parser import VkParserSpider

group_name = ''
method = 'groups.getMembers'
method_2 = 'users.getSubscriptions'
access_token = ''

if __name__ == '__main__':
    crawl_settings = Settings()
    crawl_settings.setmodule(settings)
    crawl_procc = CrawlerProcess(settings=crawl_settings)
    crawl_procc.crawl(VkParserSpider, group_name, method, method_2, access_token)
    crawl_procc.start()
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(get_project_settings())
process.crawl('xunzi')
process.start()
 *  ____        __ __  __              *
 * |  _ \  ___ / _|  \/  | ___  _ __   *
 * | | | |/ _ \ |_| |\/| |/ _ \| '_ \  *
 * | |_| |  __/  _| |  | | (_) | | | | *
 * |____/ \___|_| |_|  |_|\___/|_| |_| *
 *                                     *
 ***************************************
 * DefMon Release 0.1                  *
 * Coded by @__mvalle__                *
 ***************************************
 """)

parser = argparse.ArgumentParser(
    description='Deface Monitor: recursively crawl a domain and check for defaced pages',
    epilog='Example of use: ./run.py -d mydefaceddomain.com -u http://mydefaceddomain.com/hackedPages/'
)
parser.add_argument("--domain", '-d', help="Allowed domain", required=True)
parser.add_argument("--url", '-u', help="Start URL", required=True)

try:
    args = parser.parse_args()
except SystemExit:
    parser.print_help()
    exit(1)

process = CrawlerProcess(get_project_settings())
process.crawl('mySpider', domain=args.domain, start_url=args.url)
process.start()
def start_requests(self):
    for url in self.urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
    project = {}
    project["id"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/@data-id').get()
    project["user_id"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/div[1]/div/div[2]/div[1]/div[1]/a/img/@data-id').get()
    project["title"] = response.xpath('/html/body/div[2]/div[1]/div/div/div/div[1]/div[1]/div/div[2]/div[1]/figcaption/span/text()').get()

    creative_fields = []
    for s_creative_field in response.css('li.ProjectTools-projectField-2yD'):
        creative_fields.append(s_creative_field.css('a::text').get().lower())
    project["creative_fields"] = creative_fields

    tags = []
    for s_tag in response.css('a.ProjectTags-tagLink-Hh_'):
        tags.append(s_tag.css('a::text').get().lower().strip())
    project["tags"] = tags

    self.projects.append(project)


process = CrawlerProcess()
process.crawl(ProjectsSpider)
process.start()
def run():
    # the JOBDIR string was built but never applied in the original;
    # passing it as a setting enables persistent crawl state under
    # SAVE_PATH\request
    process = CrawlerProcess({"JOBDIR": SAVE_PATH + "\\request"})
    process.crawl(RrUserSpider)
    process.start()
    for post in posts:
        yield {
            'text': re.sub(
                "[\n\r\t]{1,}", "\n",
                "".join(post.css('.messageText::text').extract()).strip("\n\t\r")),
            'author': post.css('.messageMeta .username.author::text').extract()[0],
            'date': post.css('.messageMeta .DateTime::text').extract()[0],
            'link': "http://forum.lvivport.com/" + post.css(
                '.messageMeta .datePermalink::attr(href)').extract()[0]
        }

    next_page = response.css('.PageNav a.PageNavNext:not(.hidden)')[0]
    if next_page:
        yield response.follow(next_page, self.parse_inner_page)


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'DOWNLOAD_DELAY': '0.25',
    'FEED_FORMAT': 'jl',
    'FEED_URI': 'output.jsonline',
    'FEED_EXPORT_ENCODING': 'utf-8'
})
process.crawl(LvivPortScraper)
process.start()
        for tag in body.select('style'):
            tag.decompose()
        text = body.get_text(separator='\n')
        text = text.replace("\n", " ").replace("\t", " ").replace("\r", " ")
        return text.lower()

    web_text = get_text_bs(web_text)
    exist_list = checkActivity(act_list, web_text)
    activities = ', '.join(exist_list)
    start_url = ', '.join(self.start_urls)

    item = {}
    item['start_url'] = start_url
    item['activities'] = activities
    return item


process = CrawlerProcess(settings={
    "FEEDS": {
        "data/items_23.json": {
            "format": "json"
        },
    },
})
process.crawl(ActivitySpider23)
process.start()
def main():
    target_board = ['NSwitch']
    process = CrawlerProcess(get_project_settings())
    for board in target_board:
        process.crawl('PTTCrawler', board=board)
    process.start()
__author__ = 'LeoDong'

from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SAECrawlers.spiders.PagesCrawler import PagesCrawler
from util import tool

# tool.init_database()
# tool.init_working_path()
# configure_logging()

process = CrawlerProcess(get_project_settings())
process.crawl(PagesCrawler)
process.start()
    Handles any exception that occurs while crawling and reissues a request
    to the server for the URL which failed.

    :param failure: Error details
    """
    # Log all failures
    self.logger.error(repr(failure))

    # Check the type of failure and handle it accordingly
    if failure.check(HttpError):
        self.logger.error('HttpError on %s', failure)
    elif failure.check(DNSLookupError):
        # This is the original request
        self.logger.error('DNSLookupError on %s', failure)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError on %s', failure)

    # Reissue a request for the failed URL (Request expects a URL,
    # not the Failure object itself)
    yield scrapy.Request(failure.request.url, dont_filter=True,
                         callback=self.download_data_files)


# Main program
process = CrawlerProcess(get_project_settings())
process.crawl(PathCrawler)
process.start()
os.remove(PathCrawler.pagination_file.name)
os.remove(PathCrawler.file_object.name)
args = parser.parse_args()

settings = get_project_settings()
if args.db_uri:
    settings.set('SQLALCHEMY_DATABASE_URI', args.db_uri)
if args.user_agents:
    settings.set('USER_AGENT_FILE', args.user_agents)
if args.log_file:
    settings.set('LOG_FILE', args.log_file)
if args.log_level:
    settings.set('LOG_LEVEL', args.log_level)

process = CrawlerProcess(settings)
if args.crawler:
    for each_crawler in args.crawler:
        process.crawl(each_crawler)
elif args.daily:
    process.crawl('douyu_daily')
    process.crawl('panda_daily')
    process.crawl('quanmin_daily')
    process.crawl('bilibili_daily')
else:
    settings.set('CLOSESPIDER_TIMEOUT', 1000)
    process.crawl('bilibili')
    process.crawl('douyu')
    process.crawl('longzhu')
    process.crawl('panda')
    process.crawl('zhanqi')
    process.crawl('huya')
    process.crawl('quanmin')
    process.crawl('huomao')
def start_scraping(start_url, scrap_mode):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(forum_spider.CategoriesSpider, start_url, scrap_mode)
    process.start()
custom_settings = {
    'DOWNLOAD_DELAY': '10',
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

def start_requests(self):
    urls = getUrls()
    for url in urls:
        file = url.split("###V###")[1].strip()
        url = url.split("###V###")[0].strip()
        yield scrapy.Request(
            url=url,
            callback=lambda r, file=file: self.parse(r, file),
            dont_filter=True)

def parse(self, response, file):
    directory = './data/kickstarter/creator/'
    filename = '%s.html' % file
    with open(os.path.join(directory, filename), 'wb') as f:
        f.write(response.url.strip())
        f.write(response.body)


print "Starting Crawl"

# Start crawling process and spider
process = CrawlerProcess()
process.crawl(AmazonSpider)
process.start()
process.stop()
        # yield hotel_response

    @staticmethod
    def get_hotelidlist():
        try:
            db_connection = MySQLdb.connect('localhost', 'root', 'welcome', 'hotel_livedb')
            cursor = db_connection.cursor()
            sql = 'select hotel_unique_id from desiya_hotels'
            cursor.execute(sql)
            records = cursor.fetchall()
            hotelid_list = [record[0] for record in records]
            # temporary override: use a fixed list of hotel ids
            hotelid_list = ['00000002', '00000004', '00000005', '00000007',
                            '00000010', '00000011', '00000012', '00000013',
                            '00000014', '00000015']
            return hotelid_list
        except Exception as e:
            print("Error connecting to db")


if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'FEED_URI': 'items.json',
        'CONCURRENT_REQUESTS': '1',
        'DOWNLOAD_DELAY': '5',
        'ITEM_PIPELINES': {
            'pipelines.MySQLStorePipeline': 1,
        }
    })
    process.crawl(YatrapiSpider)
    process.start()
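# Note: FEED_FORMAT / FEED_URI as used above still work on older Scrapy
# releases, but Scrapy 2.1 replaced them with the FEEDS setting. The same
# export would look roughly like this on a current release:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {
        'items.json': {'format': 'json'},
    },
    'CONCURRENT_REQUESTS': 1,
    'DOWNLOAD_DELAY': 5,
})
process.crawl(YatrapiSpider)
process.start()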
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from corona_crawler.corona_crawler.spiders.corona import CoronaSpider

process = CrawlerProcess(get_project_settings())
process.crawl(CoronaSpider)
process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from main.spiders.main_spider import MainSpider

settings = get_project_settings()
settings['ITEM_PIPELINES'] = {'main.pipelines.JsonWriterPipeline': 1}

process = CrawlerProcess(settings)
process.crawl(MainSpider)
process.start()
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from immospider.spiders.immoscout import ImmoscoutSpider

process = CrawlerProcess(settings=get_project_settings())
# process.crawl(ImmoscoutSpider, url="https://www.immobilienscout24.de/Suche/S-T/Wohnung-Miete/Berlin/Berlin/-/2,50-/60,00-/EURO--1000,00")
process.crawl(ImmoscoutSpider, url="https://www.immobilienscout24.de/Suche/S-T/Wohnung-Kauf/Nordrhein-Westfalen/Dortmund/-/-/-/EURO-50000,00-150000,00?enteredFrom=result_list")
process.start()

# https://github.com/balzer82/immoscraper/blob/master/immoscraper.ipynb
# Input parameters for later:
# b = 'Sachsen'  # federal state
# s = 'Dresden'  # city
# k = 'Haus'     # apartment or house
# w = 'Kauf'     # rent or buy
# url = 'http://www.immobilienscout24.de/Suche/S-T/P-%s/%s-%s/%s/%s?pagerReporting=true' % (page, k, w, b, s)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('photos_spider')
process.start()