def __init__(self, spider, bufsize=4096):
    """Writer that pushes realtime video pages to RabbitMQ and MySQL."""
    super(RealtimeVideoWriter, self).__init__(spider, bufsize)
    self.set_name('RealtimeVideoWriter')
    # Local import, presumably to avoid a circular import at module load time.
    from ..base.start_url_loads import StartUrlsLoader
    self.extend_map_handler_ = ExtendMapHandler.get_instance(
        StartUrlsLoader.get_instance('../start_urls/'))
    self.using_links_extract_ = get_project_settings().get('USING_EXTRACT_LINKS', False)
    self.new_links_extract = LinksExtractor('le_crawler.common.cdesktop_settings')
    # RabbitMQ endpoints and routing configuration.
    self.ips_ = ['10.150.140.78', '10.150.140.77', '10.150.140.79']
    self.port_ = 5672
    self.exchange = 'hbase.exchange'
    self.queue = 'hbase.search2.realtime.queue'
    self.channel = self._get_channel()
    assert self.channel
    # Misc counters / flags.
    self.total_send = 0
    self.debug = get_project_settings()['DEBUG_MODEL']
    self.tbl_n = 'crawler_video'
    # MySQL connection parameters; the connection itself is created lazily
    # elsewhere (connect_ starts as None).
    self.connect_ = None
    self.mysql_host_ = '10.150.140.80'
    self.mysql_port_ = 3306
    self.mysql_passwd_ = 'search@letv'
    self.mysql_usr_ = 'search'
    self.mysql_db_ = 'crawler'
def __init__(self, spider):
    """Local JSON page writer configured from project settings."""
    PageWriterBase.__init__(self, spider)
    conf = get_project_settings()
    time_limit = conf.getint('LOCAL_PAGE_WRITER_DATA_TIME_LIMIT', 86400)
    flush_limit = conf.getint('LOCAL_PAGE_WRITER_DATA_FLUSH_LIMIT', 20000)
    data_dir = conf.get('LOCAL_PAGE_WRITER_DATA_DIR', '/letv/crawler_delta/')
    self._init(time_limit, flush_limit, data_dir)
    self.set_name('PageLocalJsonWriter')
def mongodb():
    """Yield a MongoDB database handle, closing the client afterwards."""
    conf = get_project_settings()
    dbname = conf.get('MONGO_PIPELINE_DBNAME', 'scrapyh')
    client = MongoClient(conf.get('MONGO_PIPELINE_HOST', 'localhost'))
    try:
        yield client[dbname]
    finally:
        client.close()
def __init__(self, spider):
    """Page writer: small /tmp buffers in debug mode, production limits otherwise."""
    PageWriterBase.__init__(self, spider)
    conf = get_project_settings()
    if conf['DEBUG_MODEL']:
        self._init(86400, 1000, '/tmp/crawler_delta/')
        self.set_name('PageLocalJsonWriterTest')
    else:
        self._init(conf.getint('LOCAL_PAGE_WRITER_DATA_TIME_LIMIT', 86400),
                   conf.getint('LOCAL_PAGE_WRITER_DATA_FLUSH_LIMIT', 20000),
                   conf.get('LOCAL_PAGE_WRITER_DATA_DIR', '/letv/crawler_delta/'))
        self.set_name('PageLocalJsonWriter')
def __init__(self, spider):
    """Writer that forwards crawled pages to the CDesktop data receiver."""
    super(CDesktopWriter, self).__init__(spider)
    self.set_name('CDesktopWriter')
    # Raw-socket receiver endpoint.
    self.data_reciver_ip = '10.180.92.206'
    self.data_port = 10086
    self.total_send_count = 0
    # Transport type: 'http' posts to post_url; presumably any other value
    # uses the socket pipe created below -- confirm in __create_data_pipe.
    self.type_c = get_project_settings().get('CD_WRITE_TYPE', 'http')
    self.post_url = get_project_settings().get(
        'CD_WRITE_POST_URL', 'http://10.180.92.206:9998/bigdata/post/webpage')
    self.connection = None
    self.__create_data_pipe()
    self.retry_time_max = 10
def __prepare_seed_list(self, tournament, year, month):
    """Build the list of whoscored feed URLs to seed the crawl.

    If tournament/year/month are all given, return the single matching feed
    URL; otherwise expand every configured tournament over months 6-11 of the
    first configured tournament year.
    """
    whoscored_feed_url = get_project_settings().get('WHOSCORED_FEED_URL')
    if tournament and year and month:
        return [whoscored_feed_url % (tournament, year, month.zfill(2))]
    tournaments = get_project_settings().get('TOURNAMENTS')
    years = get_project_settings().get('TOURNAMENT_YEARS')
    # FIX: the original wrote xrange(06, 12) -- the leading-zero literal is
    # Python 2 octal notation (06 == 6) and a syntax error under Python 3;
    # plain 6 has identical behavior.
    dates = [(years[0], month) for month in xrange(6, 12)]
    # dates.extend([(years[1], month) for month in xrange(1, 6)])
    return [whoscored_feed_url % (tournament, year, str(month).zfill(2))
            for tournament in tournaments
            for (year, month) in dates]
def __init__(self):
    """Open the MySQL connection and make sure the PDF directory exists."""
    settings = get_project_settings()
    self.conn = MySQLdb.connect(user=settings.get('DB_USER'),
                                passwd=settings.get('DB_PASSWD'),
                                db=settings.get('DB_DATABASE'),
                                host=settings.get('DB_HOST'),
                                charset="utf8",
                                use_unicode=True)
    self.cursor = self.conn.cursor()
    pdfPath = get_project_settings().get('PDF_DIR')
    self.pdfPath = pdfPath
    if not os.path.isdir(pdfPath):  # check that the PDF directory exists.
        os.makedirs(pdfPath)  # create the directory.
        # Directory had to be created, so the full repository did not exist yet.
        self.fullRepositoryExist = False
def initStartUrls(self):
    """Load car-series ids from MySQL and build the spider's start URLs."""
    # self.kkcardb = sqlite3.connect(self.dbName)
    # self.cursor = self.kkcardb.cursor()
    # Connect to MySQL.
    settings = get_project_settings()
    host = settings.get('MYSQL_HOST')
    port = settings.get('MYSQL_PORT')
    user = settings.get('MYSQL_USER')
    passwd = settings.get('MYSQL_PASSWD')
    dbName = settings.get('MYSQL_DBNAME')
    dbTool = connMySQL(host, int(port), user, passwd, dbName)
    self.kkcardb = dbTool[0]
    self.cursor = dbTool[1]
    # sqlite3 variant (kept for reference):
    # seriesIdList = self.cursor.execute('select id from t_bx_car_series')
    # for seriesId in seriesIdList:
    #     start_url = 'http://www.16888.com/'+str(seriesId[0])
    #     self.start_urls.append(start_url)
    #     print('start_urls='+start_url)
    # MySQL variant:
    count = self.cursor.execute('select id from t_bx_car_series')
    print('count='+str(count))
    seriesList = self.cursor.fetchall()
    for seriesId in seriesList:
        start_url = 'http://www.16888.com/'+str(seriesId[0])
        self.start_urls.append(start_url)
        print('start_urls='+start_url)
    self.cursor.close()
    self.kkcardb.close()
def settings():
    """Pytest fixture: project settings with LAST_RUNS_PATH patched to /tmp."""
    patched = {'LAST_RUNS_PATH': '/tmp/last_runs/'}
    project_settings = get_project_settings()
    with patch.dict(project_settings, patched):
        yield project_settings
def __init__(self):
    """Set up a crawler process with logging disabled and item collection."""
    project_settings = get_project_settings()
    project_settings.set('LOG_ENABLED', False, priority='cmdline')
    self.crawler = CrawlerProcess(project_settings)
    self.items = []
    # Collect every scraped item via the dispatcher signal.
    SignalManager(dispatcher.Any).connect(self._item_passed,
                                          signal=signals.item_scraped)
def setup_crawler():
    """Run DmmDirectSpider against the URL given on the command line."""
    crawler = Crawler(get_project_settings())
    crawler.configure()
    crawler.crawl(DmmDirectSpider(url=sys.argv[1]))
    crawler.start()
def setup_crawler(user, website, validator_set, parameters):
    """Run WebQualitySpider for one user/website with the given validators."""
    crawler = Crawler(get_project_settings())
    crawler.configure()
    crawler.crawl(WebQualitySpider(user=user, website=website,
                                   validators=validator_set,
                                   parameters=parameters))
    crawler.start()
def setup_crawler(domain):
    """Run FollowAllSpider over the given domain."""
    crawler = Crawler(get_project_settings())
    crawler.configure()
    crawler.crawl(FollowAllSpider(domain=domain))
    crawler.start()
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.

    There are four kinds of spiders: common, ajax, gfw, ajax_gfw.
    If you don't assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # Task matched no case: silently ignored (warning disabled).
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    # Stop the reactor once every crawl deferred has fired.
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run(self):
    """Launch the 'stackoverflow' spider in a blocking crawler process."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl('stackoverflow')
    crawler_process.start()
def get(self):
    """Repeatedly run the 'iqiyi' spider, sleeping 3000s between runs.

    NOTE(review): CrawlerProcess.start() stops the Twisted reactor when the
    crawl finishes and a reactor cannot be restarted -- the second loop
    iteration will likely fail. Confirm whether this loop is meant to cycle.
    """
    while True:
        process = CrawlerProcess(get_project_settings())
        process.crawl('iqiyi')
        process.start()
        time.sleep(3000)
        self.finish()
def crawl_articles(spids):
    """Run the given spider ids ('all' expands to every known spider)."""
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = []
    for spid in spids:
        if spid in loader.list():
            spiders.append(loader.load(spid))
    if not spiders:
        return
    # Randomise crawl order so no source is systematically favoured.
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    done = runner.join()
    done.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
def startSpiderTest(group_type, spider_type, spider_group_name, spider_name):
    """Create, wire up and run one spider via the Scrapy core API."""
    # Use Scrapy's internal machinery directly.
    settings = get_project_settings()
    # Instantiate a crawler process; one process can run several crawls.
    crawlerProcess = CrawlerProcess(settings)
    # Create a crawler for this spider name.
    crawler = crawlerProcess.create_crawler(spider_name)
    # Hook spider lifecycle signals to the corresponding handlers.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)
    # Look up the spider class and its default constructor arguments.
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"
    spider = spiderConf[0](**spiderArgs)
    # Attach the spider instance to the crawler.
    crawler.configure()
    crawler.crawl(spider)
    # Start the process (blocking) and then stop it.
    crawlerProcess.start()
    crawlerProcess.stop()
def run():
    """Crawl entertainmentcareers with tuned concurrency settings."""
    overrides = {
        'CONCURRENT_ITEMS': 250,
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    spider = EntertainmentcareersSpider()
    settings = get_project_settings()
    settings.update(overrides)
    runner = CrawlerRunner(settings)
    runner.crawl(spider)
    deferred = runner.join()
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
def spiderCrawl(bandname):
    """Create the band link file, then crawl it with an MSIE user agent."""
    createLink(bandname)
    project_settings = get_project_settings()
    project_settings.set(
        'USER_AGENT',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(MySpider)
    crawler_process.start()
def _setup(self, project):
    """Configure a crawler for the project's link spider and register it."""
    spider = crawlspider.LinkSpider(project)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    # NOTE(review): the crawler is configured but never started here --
    # presumably self.add_crawler() registers/starts it elsewhere; confirm.
    self.add_crawler()
def setup_crawler(id="550", publisher="rbd"):
    """Run DmmQuerySpider for the given id/publisher pair."""
    crawler = Crawler(get_project_settings())
    crawler.configure()
    crawler.crawl(DmmQuerySpider(id, publisher))
    crawler.start()
def __init__(self, spider):
    """Wrap a configured Crawler in a separate process for this spider."""
    Process.__init__(self)
    self.crawler = Crawler(get_project_settings())
    self.crawler.configure()
    # Stop the reactor as soon as the spider closes.
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True, crawlContents=True, crawlFormData=True):
    """Crawl a hidden service through the local Tor port and store results."""
    def catch_item(sender, item, **kwargs):
        # Rewrite the local-proxy URL back to the hidden-service URL.
        item['url'] = item['url'].replace('http://127.0.0.1:'+str(localPort)+extraPath, hiddenWebSite)
        print "[+]Processing URL %s ... " %(item['url'])
        from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
        database = TortazoDatabase()
        database.initDatabaseDeepWebCrawlerPlugin()
        self.__processPage(item, database)
    # Set up the crawler: persist items as they pass, stop the reactor on close.
    dispatcher.connect(catch_item, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
    settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir+hiddenWebSite)
    crawler = Crawler(settings)
    crawler.configure()
    spider = HiddenSiteSpider("http://127.0.0.1:"+str(localPort)+extraPath, hiddenWebSite, self.extractorRules)
    # Apply the caller's feature toggles to the spider.
    spider.setImages(crawlImages)
    spider.setLinks(crawlLinks)
    spider.setContents(crawlContents)
    spider.setForms(crawlFormData)
    crawler.crawl(spider)
    print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
    crawler.start()
    reactor.run()
    print "[+] Crawler finished."
def __init__(self, spider):
    """Writer for Baidu hot ranking lists; small buffers in debug mode."""
    PageWriterBase.__init__(self, spider)
    # NOTE(review): other writers in this project read 'DEBUG_MODEL' while
    # this one reads 'DEBUG_MODE' -- confirm which settings key exists.
    if get_project_settings()['DEBUG_MODE']:
        self._init(300, 20000, '/tmp/crawler_baidu_hot/')
        self.set_name('RankingListWriterDebug')
    else:
        self._init(3600, 30000, '/letv/crawler_baidu_hot/')
        self.set_name('RankingListWriter')
    # Map from the on-page (Chinese) list title to the ranking type enum.
    self._rank_type_map = {'实时热点': RankingListType.BaiduHotRealTime,
                           '今日热点': RankingListType.BaiduHotToday,
                           '七日热点': RankingListType.BaiduHot7Days,
                           '民生热点': RankingListType.BaiduHotLife,
                           '娱乐热点': RankingListType.BaiduHotPlay,
                           '体育热点': RankingListType.BaiduHotSports,
                           '百度电视剧榜': RankingListType.BaiduHotDrama,
                           '百度电影榜': RankingListType.BaiduHotMovie,
                           '百度动漫榜': RankingListType.BaiduHotComic,
                           '百度综艺榜': RankingListType.BaiduHotVariety
                           }
    # One RankingList and one written-flag slot per ranking type.
    self._ranking_lists = []
    self._is_wrote = []
    for i in range(len(self._rank_type_map.keys())):
        self._ranking_lists.append(RankingList())
        self._is_wrote.append(False)
def run_crawler_by_runner():
    """Schedule every spider in `spiders` on one runner and block until done."""
    runner = CrawlerRunner(get_project_settings())
    # FIX: the original used a list comprehension purely for its side effects,
    # building and discarding a list of deferreds; a plain loop is the
    # idiomatic (and allocation-free) form.
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def load_site_conf(site):
    """Parse a per-site XML rule file and populate the global Conf/Utils state."""
    # pid_file = file("%s.pid" % site, "w+")
    # pid_file.write(os.getpid())
    # pid_file.close()
    Utils.settings = get_project_settings()
    sites_rule = {}
    conf_dict = xmltodict.parse(file(r"%s" % (site)).read())
    sites_rule = conf_dict['SiteRule']['Sites']['Site']
    # A single <Site> element parses as a dict; normalise it to a list.
    if not isinstance(sites_rule, list):
        sites_rule = [sites_rule]
    Conf.conf_dict = conf_dict
    Utils.conf_dict = conf_dict
    Conf.sites_rule = sites_rule
    Conf.ua = Conf.conf_dict['SiteRule'].get('UserAgent', '')
    Utils.get_mongodb_client()
    db = Utils.get_db()
    # Prefer the task id stored in the status table for this rule's uuid;
    # fall back to the TaskId embedded in the XML.
    res = list(db.select(Utils.settings['MYSQL_TASKSTATUS_TABLE'], what="id",
                         where="uuid=$uuid",
                         vars={"uuid": conf_dict['SiteRule'].get('Uuid', "")}))
    Conf.uuid = int(conf_dict['SiteRule'].get('Uuid', 0))
    Conf.task_id = int(res[0]['id']) if len(res) else int(conf_dict['SiteRule']['TaskId'])
    Conf.pid = int(conf_dict['SiteRule']['TaskId'])
    # Output sinks keyed by output format.
    Conf.output_files = {
        ConfConstants.Output.XML: Conf.conf_dict['SiteRule']['XMLFileName'],
        ConfConstants.Output.JSON: Conf.conf_dict['SiteRule']['JsonFileName'],
        ConfConstants.Output.MongoDB: Conf.conf_dict['SiteRule']['TableName'],
        ConfConstants.Output.Hadoop: Conf.conf_dict['SiteRule']['Hadoop']
    }
    Utils.daemon = ServiceDaemon()
def main(): """Index alexa demographics """ engine = db_connect() Session = sessionmaker(bind=engine) session = Session() settings = get_project_settings() settings.set('ITEM_PIPELINES', {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300}) settings.set('EXTENSIONS', {'scrapy.telnet.TelnetConsole': None,}) process = CrawlerProcess(settings) for website in session.query(WebsitesContent).all(): demographic = list(session.query(Websites).filter_by(link=website.link)) if len(demographic) is 0: url = website.link print website.link AlexaSpider.name = url process.crawl(AlexaSpider, url=url, db_session=session) process.start() process.stop() session.close()
def main(argv):
    """Command-line entry: crawl BBC articles or search the stored ones."""
    try:
        opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
    except getopt.GetoptError:
        print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
            sys.exit()
        elif opt == '-c':
            # Start crawling articles.
            print "crawling"
            process = CrawlerProcess(get_project_settings())
            process.crawl(BBCArticleSpider)
            process.start()
        elif opt in ('-t', '--title'):
            print "search by title"
            # Search stored articles by title.
            results = BBCArticleItem.fetch_by_title(arg)
            for result in results:
                print result
        elif opt in ('-s', '--section'):
            print "search by section"
            # Search stored articles by section.
            results = BBCArticleItem.fetch_by_section(arg)
            for result in results:
                print result
def fetch_data(self, mobile_number):
    """POST an encrypted lookup for mobile_number directly through the Scrapy
    download handler, bypassing the scheduler/engine entirely."""
    request = scrapy.Request(
        url=crypt.get_posturl(),
        method='POST',
        body=crypt.get_poststr(mobile_number),
        headers={
            'X-CLIENT-PFM': '20',
            'X-CLIENT-VCODE': '81',
            'X-CLIENT-PID': '8888888',
            'Content-Type': 'application/json; charset=utf-8',
            'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 5.0.2; Redmi Note 2 MIUI/V7.5.5.0.LHMCNDE',
            'Accept-Encoding': 'gzip',
        }
    )
    # Carry the crypto session material alongside the request for the callbacks.
    request.meta['mobile'] = mobile_number
    request.meta['msk'] = crypt.sk
    request.meta['mtk'] = crypt.tk
    request.meta['muid'] = crypt.uid
    settings = get_project_settings()
    downloader = HTTP11DownloadHandler(settings)
    deferred = downloader.download_request(request)
    deferred.addCallback(self.parse_response, request)
    deferred.addErrback(self.parse_error)
    # Blocks here; parse_response / parse_error are expected to stop the reactor.
    reactor.run()
def __init__(self, id_list, *args, **kwargs):
    """Remember the ids to crawl and prepare a logged CrawlerRunner."""
    super().__init__(*args, **kwargs)
    self.id_list = id_list
    configure_logging()
    self.runner = CrawlerRunner(settings=get_project_settings())
def run_crawl():
    """Run the 'quotes' spider and block until it finishes."""
    crawler_process = CrawlerProcess(settings=get_project_settings())
    crawler_process.crawl('quotes')
    crawler_process.start()
def loop_crawl():
    """Start a crawl and schedule one follow-up crawl when it completes."""
    runner = CrawlerRunner(get_project_settings())
    deferred = crawl(runner)
    deferred.addBoth(lambda _: crawl(runner))
    reactor.run()
def run_crawler(url):
    """Crawl the given czbooks URL in a blocking crawler process."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(CzbooksSpider, url=url)
    crawler_process.start()
def main():
    """Entry point: run PoeApiSpider to completion."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(PoeApiSpider)
    crawler_process.start()
class ZhiHuSpider(scrapy.Spider):
    """Spider that logs in to zhihu.com (click-captcha flow) and scrapes
    front-page questions plus a bounded range of answers per question."""
    name = "zhihu"
    start_urls = ['https://zhihu.com']
    allowed_domains = ['www.zhihu.com']
    setting = get_project_settings()
    headers = setting['DEFAULT_REQUEST_HEADERS']
    post_data = setting['POST_DATA']
    question_count = setting['QUESTION_COUNT']
    answer_count = setting['ANSWER_COUNT_PER_QUESTION']
    answer_offset = setting['ANSWER_OFFSET']
    # The captcha glyph positions are fixed: index i maps to its coordinates.
    capacha_index = [[12.95, 14.969999999999998], [36.1, 16.009999999999998],
                     [57.16, 24.44], [84.52, 19.17], [108.72, 28.64],
                     [132.95, 24.44], [151.89, 23.380000000000002]]
    # Paging template for the front-page question feed.
    next_page = 'https://www.zhihu.com/api/v3/feed/topstory?action_feed=True&limit=10&' \
                'session_token={0}&action=down&after_id={1}&desktop=true'
    session_token = ''
    # URL triggered by "view more answers".
    more_answer_url = 'https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.i' \
                      's_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_actio' \
                      'n%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_ed' \
                      'it%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2' \
                      'Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Crevie' \
                      'w_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2' \
                      'Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.ur' \
                      'l%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.t' \
                      'opics&offset={1}&limit={2}&sort_by=default'

    def start_requests(self):
        yield scrapy.Request('https://www.zhihu.com/', callback=self.login_zhihu)

    def login_zhihu(self, response):
        """Fetch the xsrf token and the captcha image."""
        xsrf = re.findall(r'name="_xsrf" value="(.*?)"/>', response.text)[0]
        self.headers['X-Xsrftoken'] = xsrf
        self.post_data['_xsrf'] = xsrf
        times = re.findall(
            r'<script type="text/json" class="json-inline" data-n'
            r'ame="ga_vars">{"user_created":0,"now":(\d+),', response.text)[0]
        captcha_url = 'https://www.zhihu.com/' + 'captcha.gif?r=' + times + '&type=login&lang=cn'
        yield scrapy.Request(captcha_url, headers=self.headers,
                             meta={'post_data': self.post_data},
                             callback=self.veri_captcha)

    def veri_captcha(self, response):
        """Save the captcha image and log in with the user-typed positions."""
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        # Prompt (Chinese): "if there is only one inverted glyph, the second
        # position is 0".
        print('只有一个倒立文字则第二个位置为0')
        loca1 = eval(input('input the loca 1:'))
        loca2 = eval(input('input the loca 2:'))
        captcha = self.location(int(loca1), int(loca2))
        self.post_data = response.meta.get('post_data', {})
        self.post_data['captcha'] = captcha
        post_url = 'https://www.zhihu.com/login/email'
        yield scrapy.FormRequest(post_url, formdata=self.post_data,
                                 headers=self.headers,
                                 callback=self.login_success)

    def location(self, a, b):
        """Translate the typed glyph indices into the captcha payload JSON."""
        if b != 0:
            captcha = "{\"img_size\":[200,44],\"input_points\":[%s,%s]}" % (
                str(self.capacha_index[a - 1]), str(self.capacha_index[b - 1]))
        else:
            captcha = "{\"img_size\":[200,44],\"input_points\":[%s]}" % str(
                self.capacha_index[a - 1])
        return captcha

    def login_success(self, response):
        # Crude success check on the login response body.
        if 'err' in response.text:
            print((response.text))
            print("error!!!!!!")
        else:
            print("successful!!!!!!")
            yield scrapy.Request('https://www.zhihu.com', headers=self.headers,
                                 dont_filter=True)

    def parse(self, response):
        """Collect front-page questions and schedule feed paging."""
        question_urls = re.findall(r'https://www.zhihu.com/question/(\d+)',
                                   response.text)
        # session_token and authorization for paging both live in the page source.
        self.session_token = re.findall(r'session_token=([0-9,a-z]{32})',
                                        response.text)[0]
        auto = re.findall(r'carCompose":"(.*?)"', response.text)[0]
        self.headers['authorization'] = 'Bearer ' + auto
        # Questions of the first feed page.
        for url in question_urls:
            question_detail = 'https://www.zhihu.com/question/' + url
            yield scrapy.Request(question_detail, headers=self.headers,
                                 callback=self.parse_question)
        # Fetch up to question_count questions, 10 per feed page.
        n = 10
        while n < self.question_count:
            yield scrapy.Request(self.next_page.format(self.session_token, n),
                                 headers=self.headers,
                                 callback=self.get_more_question)
            n += 10

    def parse_question(self, response):
        """Parse question metadata and schedule answer-range requests."""
        text = response.text
        item = ZhihuQuestionItem()
        item['name'] = re.findall(r'<meta itemprop="name" content="(.*?)"', text)[0]
        item['url'] = re.findall(r'<meta itemprop="url" content="(.*?)"', text)[0]
        item['keywords'] = re.findall(
            r'<meta itemprop="keywords" content="(.*?)"', text)[0]
        item['answer_count'] = re.findall(
            r'<meta itemprop="answerCount" content="(.*?)"', text)[0]
        item['comment_count'] = re.findall(
            r'<meta itemprop="commentCount" content="(.*?)"', text)[0]
        item['flower_count'] = re.findall(
            r'<meta itemprop="zhihu:followerCount" content="(.*?)"', text)[0]
        item['date_created'] = re.findall(
            r'<meta itemprop="dateCreated" content="(.*?)"', text)[0]
        count_answer = int(item['answer_count'])
        yield item
        question_id = int(
            re.match(r'https://www.zhihu.com/question/(\d+)',
                     response.url).group(1))
        # Fetch answers in pages of 20, starting from answer_offset.
        if count_answer > self.answer_count:
            count_answer = self.answer_count
        n = self.answer_offset
        while n + 20 <= count_answer:
            yield scrapy.Request(self.more_answer_url.format(question_id, n, n + 20),
                                 headers=self.headers,
                                 callback=self.parse_answer)
            n += 20

    def get_more_question(self, response):
        """Extract question ids from a feed page and schedule their detail pages."""
        question_url = 'https://www.zhihu.com/question/{0}'
        questions = json.loads(response.text)
        for que in questions['data']:
            question_id = re.findall(r'(\d+)',
                                     que['target']['question']['url'])[0]
            yield scrapy.Request(question_url.format(question_id),
                                 headers=self.headers,
                                 callback=self.parse_question)

    def parse_answer(self, response):
        """Parse one page of answers into ZhihuAnswerItem records."""
        answers = json.loads(response.text)
        for ans in answers['data']:
            item = ZhihuAnswerItem()
            item['question_id'] = re.match(
                r'http://www.zhihu.com/api/v4/questions/(\d+)',
                ans['question']['url']).group(1)
            item['author'] = ans['author']['name']
            item['ans_url'] = ans['url']
            item['comment_count'] = ans['comment_count']
            item['upvote_count'] = ans['voteup_count']
            item['excerpt'] = ans['excerpt']
            yield item
def main():
    """Configure logging and run PageCrawler to completion."""
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawler_process = CrawlerProcess(get_project_settings())
    spider = PageCrawler()
    crawler_process.crawl(spider)
    crawler_process.start()
def spider_task():
    """Run HotmovieSpider once, blocking until the crawl ends."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(HotmovieSpider)
    crawler_process.start()
def connect_db():
    """Create a SQLAlchemy engine from the DATABASE project setting."""
    db_options = get_project_settings()['DATABASE']
    return create_engine(URL(**db_options))
def run_crawler():
    """Run the 'amazon' spider in a blocking crawler process."""
    crawler_process = CrawlerProcess(settings=get_project_settings())
    crawler_process.crawl('amazon')
    crawler_process.start()
def __init__(self):
    """Bind a SQLAlchemy session factory to the configured database."""
    connection_string = get_project_settings().get("CONNECTION_STRING")
    self.engine = create_engine(connection_string)
    self.Session = sessionmaker(bind=self.engine)
def main():
    """Entry point: run SolutionSpider to completion."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(SolutionSpider)
    crawler_process.start()
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html import json import scrapy # from scrapy.contrib.pipeline.images import ImagesPipeline from scrapy.pipelines.images import ImagesPipeline #DropItem 丢弃item from scrapy.exceptions import DropItem from scrapy.utils.project import get_project_settings import os # os.rename(源文件的名称,新的名称) #获取图片存储的文件夹路径 image_store = get_project_settings().get('IMAGES_STORE') #图片下载管道文件 class MyImagesPipeline(ImagesPipeline): # def file_path(self, request, response=None, info=None): # # 可以返回一个图片的路径 # #return 'full/%s.jpg' % (image_guid) # return 'full/%s.jpg' % (图片的名称) def get_media_requests(self, item, info): # 第一步从item中获取要下载的图片地址 # for image_url in item['image_urls']: # #根据图片的 url地址构建一个Request对象, # #最终交给调度器
def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    connection_string = get_project_settings().get("CONNECTION_STRING")
    return create_engine(connection_string)
def handle(self, *args, **options):
    """Django management entry point: run the gocovid spider."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(gocovid)
    crawler_process.start()
def from_crawler(cls, crawler):
    """Build the pipeline from crawler settings.

    FIX: the original read MONGODB_URI from crawler.settings but
    MONGODB_DATABASE from a separately constructed get_project_settings()
    object; use the crawler's own settings consistently for both so that
    per-crawler overrides apply to both values.
    """
    settings = crawler.settings
    return cls(mongo_uri=settings.get('MONGODB_URI'),
               mongo_db=settings.get('MONGODB_DATABASE', 'items'))
def start_requests(self):
    """Yield one Yahoo screener request per result offset (0..2299).

    FIX: removed an unused `settings = get_project_settings()` local that
    was assigned but never read.
    """
    base_url = ('https://finance.yahoo.com/screener/unsaved/'
                '4b6788ec-db90-477a-a44a-09e5cd5b5027?count=1&offset=')
    for offset in range(2300):
        yield scrapy.Request(url=base_url + str(offset), callback=self.parse)
def handle(self, *args, **options):
    """Django management entry point: run the ParliamentMemberSpider."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(ParliamentMemberSpider)
    crawler_process.start()
def outletpccatalog():
    """Build the OutletPC catalogue by running its spider to completion."""
    print("fent catàleg...")
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(OutletpcCataleg)
    # Blocks here until the crawl is finished.
    crawler_process.start()
def scrawl():
    """Run the 'posts-spider' crawl and wait for it to finish."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl('posts-spider')
    crawler_process.start()
def db_connect():
    """Return a SQLAlchemy engine built from the CONNECTION_STRING setting."""
    conn_str = get_project_settings().get("CONNECTION_STRING")
    return create_engine(conn_str)
import os
import time
import re
from scrapy.utils.project import get_project_settings
from spiders.selecrawlers.baseselespider import BaseSeleSpider
from datetime import datetime
from selenium.webdriver.common.keys import Keys
from wangban_utils.single_mode import singleton
from spiders.basemodel import DIYBaseSpider

SETTINGS = get_project_settings()
# Per-site JSON state file for this spider.
JSONFILE = os.path.join(SETTINGS['BASE_JSONFILE_PATH'], 'changxing.json')


@singleton
class ChangXingSeleSpider(BaseSeleSpider):
    """Selenium-driven spider for the Changxing public-resource site."""
    name = 'changxing'

    def __init__(self):
        super().__init__()
        # Suffix template appended to paginated URLs.
        self.post_suf = '__page_{}'
        self.source_url = 'http://ggzy.zjcx.gov.cn:8081/cxweb/'

    def get_totalpage(self, driver):
        # Read the total page count; fall back when it cannot be found.
        try:
            total_page = driver.find_element_by_xpath(self.xp.total_page).text
            total_page = int(total_page)
        except Exception as e:
            # NOTE(review): the fallback is the *string* '1' while the success
            # path yields an int -- confirm callers handle both types.
            total_page = '1'
            print('get total error', e)
            print(driver.current_url)
class LikeSpider(scrapy.Spider):
    """Spider that logs in to yande.re and scrapes the user's voted posts."""
    name = 'like'
    allowed_domains = ['yande.re']
    setting = get_project_settings()  # project settings: credentials, DB names
    # MongoDB collection holding already-stored posts (dedup lookup).
    table = pymongo.MongoClient(setting['MONGO_URL'])[setting['MONGO_DB']][
        setting['MONGO_YANDE_POST_DB']]
    # rules = (
    #     Rule(LinkExtractor(allow=r'/post\?page=\d+&tags=vote%3A%3E%3D1%3Akamiyamashiki\+order%3Avote'), callback='parse_list', follow=True),
    # )
    # https://yande.re/post?page=1&tags=vote%3A%3E%3D1%3Akamiyamashiki+order%3Avote
    # /post?page=2&tags=vote%3A%3E%3D1%3Akamiyamashiki+order%3Avote

    def start_requests(self):
        yield scrapy.Request('https://yande.re/user/login', callback=self.login)

    def login(self, response):
        """Submit the login form with the configured credentials."""
        form_data = {
            'user[name]': self.setting['YANDE_USERNAME'],
            'user[password]': self.setting['YANDE_PASSWORD']
        }
        yield FormRequest.from_response(response, formdata=form_data,
                                        callback=self.after_login)

    def after_login(self, response):
        """Open the listing of this user's voted posts, ordered by vote."""
        yield scrapy.Request('https://yande.re/post?tags=vote:' +
                             self.setting['YANDE_VOTE'] + ':' +
                             self.setting['YANDE_USERNAME'] + ' order:vote',
                             callback=self.parse_list)

    def parse_list(self, response):
        """Walk the post list, following unseen posts and the next page."""
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        print(response.url)
        print(response.body)
        for url_selector in response.xpath(
                '//li//div[@class="inner"]/a[@class="thumb"]/@href'):
            url = url_selector.get()
            pic_id = int(url.replace('/post/show/', ''))
            if not self.table.find_one({'pic_id': pic_id}):
                yield scrapy.Request('https://' + self.allowed_domains[0] + url,
                                     callback=self.parse_pic)
            else:
                # Picture already stored; skip it.
                print('#####图片 ' + str(pic_id) + ' 已存在#####')
                # pass
        if response.xpath('//a[@class="next_page"]/@href').get():
            next_page = 'https://' + self.allowed_domains[0] + response.xpath(
                '//a[@class="next_page"]/@href').get()
            yield scrapy.Request(next_page, callback=self.parse_list)

    def parse_pic(self, response):
        """Extract one post's metadata into a Pic item."""
        print('开始处理' + response.url)
        pic = Pic()
        pic['unchanged_pic_url'] = response.xpath(
            '//a[@class="original-file-unchanged"]/@href').extract_first()
        pic['changed_pic_url'] = response.xpath(
            '//a[@class="original-file-changed"]/@href').extract_first()
        pic['pic_id'] = int(
            response.xpath(
                '//*[@id="stats"]/ul/li[1]/text()').extract_first().replace(
                    'Id: ', ''))
        pic['rating'] = response.xpath(
            '//li[contains(./text(), "Rating")]/text()').get().replace(
                'Rating: ', '')
        pic['size'] = response.xpath(
            '//*[@id="stats"]/ul/li[contains(./text(), "Size")]/text()').get(
            ).replace('Size: ', '')
        pic['source'] = response.xpath(
            '//*[@id="stats"]/ul/li[contains(./text(), "Source")]/a/@href'
        ).get()
        pic['date'] = datetime.datetime.now()
        # pic['tags'] = '■'.join(response.xpath('//ul[@id="tag-sidebar"]/li/a[contains(./@href, "/post?tags=") and not(@class)]/text()').extract())
        pic['tags'] = response.xpath(
            '//ul[@id="tag-sidebar"]/li/a[contains(./@href, "/post?tags=") and not(@class)]/text()'
        ).extract()
        parent = response.xpath(
            '//div/a[contains(text(), "parent post")]/@href').get()
        if parent:
            pic['parent_pic_id'] = int(parent.replace('/post/show/', ''))
        if response.xpath('//p/span').get():
            pic['pool'] = response.xpath(
                '//p/span/following-sibling::*/text()').get()
            pic['pool_id'] = int(
                response.xpath(
                    '//p/span/following-sibling::*/@href').get().replace(
                        '/pool/show/', ''))
            pic['pool_seq'] = response.xpath('//p/span/text()').get().replace(
                '#', '')
        yield pic
def handle(self, *args, **options):
    """Django management entry point: run the NBA spider."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(nba.NbaSpider)
    crawler_process.start()
def process_request(self, request: Any, spider: scrapy.Spider) -> None:
    """Downloader middleware hook: pick a random User-Agent if none is set."""
    agents = get_project_settings()['USER_AGENTS']
    request.headers.setdefault('User-Agent', random.choice(agents))
    return None
def __init__(self, *args, **kwargs):
    """Cache project settings and snapshot the proxy list."""
    super().__init__(*args, **kwargs)
    self.settings = get_project_settings()
    # NOTE(review): self.proxies is presumably provided by the base class or
    # a property derived from the settings above -- confirm its origin.
    self._proxy_lis = self.proxies
import logging
from scrapy.utils.project import get_project_settings
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

logger = logging.getLogger(__name__)

# scrapy_mysql is an external dependency; fail loudly at import time if
# it is missing rather than later at first use.
try:
    from scrapy_mysql import connection
except:
    print("Module is not installed : scrapy_mysql")
    raise

settings = get_project_settings()
url_server = connection.from_settings(settings)

# Sentinel status codes recorded for the different failure classes.
HTTP_ERROR_CODE = -1
DNS_ERROR_CODE = -2
TIMEOUT_CODE = -3
UNKNOWN_HTTP_ERROR = -4


class ResponseErrMiddleware(object):
    """Downloader middleware that records request failures by error class."""

    def process_exception(self, request, exception, spider):
        # Requests are expected to carry a custom `id` attribute.
        url = request.url
        id = getattr(request, "id", None)
        if id is None:
            logger.error("No [id] in Request of url: [{}]".format(url))
from medicalDataSpider.spiders.so39 import So39Spider
from medicalDataSpider.spiders.shiguanzhijia import ShiguanZhijiaSpider
from medicalDataSpider.spiders.jianshu import JianshuSpider
from medicalDataSpider.spiders.haodaifu import HaodaifuSpider
from medicalDataSpider.spiders.fh21 import Fh21Spider
from medicalDataSpider.spiders.bozhong import BozhongSpider
from medicalDataSpider.spiders.babytree import BabytreeSpider
from scrapy.crawler import CrawlerProcess
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from medicalDataSpider.spiders.ask120 import Ask120Spider
from scrapy.utils.log import configure_logging
from keywords import keywords
from short_keywords import short_keywords

configure_logging()
process = CrawlerRunner(get_project_settings())


# Run HaodaifuSpider once per keyword, sequentially; stop the reactor when
# every keyword has been crawled.
@defer.inlineCallbacks
def crawl():
    for keyword in keywords:
        yield process.crawl(HaodaifuSpider, keyword)
    reactor.stop()


crawl()
reactor.run()
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # Replace 'hupu' with your own spider name if needed.
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl('hupu')
    crawler_process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.utils import project

from myScrapy.spiders.ACG12 import ACG12
from myScrapy.spiders.QiWenBeauty import QiWenBeauty

# Spider-name -> enabled flag; disabled spiders are skipped.
spiders = {
    QiWenBeauty.name: True,
    ACG12.name: True
}

if __name__ == '__main__':
    process = CrawlerProcess(project.get_project_settings())
    for spider_name in process.spiders.list():
        if spiders.get(spider_name):
            print("启动爬虫 >>> %s <<<" % spider_name)
            process.crawl(spider_name)
    process.start()