def __init__(self, spider, bufsize=4096):
    super(RealtimeVideoWriter, self).__init__(spider, bufsize)
    self.set_name('RealtimeVideoWriter')
    from ..base.start_url_loads import StartUrlsLoader

    self.extend_map_handler_ = ExtendMapHandler.get_instance(
      StartUrlsLoader.get_instance('../start_urls/'))
    self.using_links_extract_ = get_project_settings().get('USING_EXTRACT_LINKS', False)
    self.new_links_extract = LinksExtractor('le_crawler.common.cdesktop_settings')

    # RabbitMQ
    self.ips_ = ['10.150.140.78', '10.150.140.77', '10.150.140.79']
    self.port_ = 5672
    self.exchange = 'hbase.exchange'
    self.queue = 'hbase.search2.realtime.queue'
    self.channel = self._get_channel()
    assert self.channel
    # others
    self.total_send = 0
    self.debug = get_project_settings()['DEBUG_MODEL']

    self.tbl_n = 'crawler_video'
    self.connect_ = None
    self.mysql_host_ = '10.150.140.80'
    self.mysql_port_ = 3306
    self.mysql_passwd_ = 'search@letv'
    self.mysql_usr_ = 'search'
    self.mysql_db_ = 'crawler'
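The constructor above asserts on self._get_channel(), which is not shown. A minimal sketch of what such a helper might look like, assuming the pika client with default credentials; the exchange/queue declarations are an assumption rather than part of the original:

import pika  # assumed AMQP client

def _get_channel(self):  # sketch of a method on the writer class above
    # try each broker address in turn and return the first usable channel
    for ip in self.ips_:
        try:
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(host=ip, port=self.port_))
            channel = connection.channel()
            # ensure the exchange and queue exist and are bound together
            channel.exchange_declare(exchange=self.exchange,
                                     exchange_type='direct', durable=True)
            channel.queue_declare(queue=self.queue, durable=True)
            channel.queue_bind(queue=self.queue, exchange=self.exchange)
            return channel
        except Exception:
            continue
    return None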
Example #2
 def __init__(self, spider):
   PageWriterBase.__init__(self, spider)
   self._init(
         get_project_settings().getint('LOCAL_PAGE_WRITER_DATA_TIME_LIMIT', 86400),
         get_project_settings().getint('LOCAL_PAGE_WRITER_DATA_FLUSH_LIMIT', 20000),
         get_project_settings().get('LOCAL_PAGE_WRITER_DATA_DIR', '/letv/crawler_delta/')
         )
   self.set_name('PageLocalJsonWriter')
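The writer above reads its flush limits and output directory from the project settings. A hypothetical settings.py fragment that would satisfy those lookups (the values simply mirror the defaults passed to getint/get):

LOCAL_PAGE_WRITER_DATA_TIME_LIMIT = 86400            # flush at least once a day
LOCAL_PAGE_WRITER_DATA_FLUSH_LIMIT = 20000           # or after 20000 buffered pages
LOCAL_PAGE_WRITER_DATA_DIR = '/letv/crawler_delta/'  # output directory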
Example #3
def mongodb():
        dbname = get_project_settings().get('MONGO_PIPELINE_DBNAME', 'scrapyh')
        dbhost = get_project_settings().get('MONGO_PIPELINE_HOST', 'localhost')
        clt  = MongoClient(dbhost)
        db = clt[dbname]
        try:
                yield db
        finally:
                clt.close()
Example #4
 def __init__(self, spider):
   PageWriterBase.__init__(self, spider)
   if get_project_settings()['DEBUG_MODEL']:
     self._init(86400, 1000, '/tmp/crawler_delta/')
     self.set_name('PageLocalJsonWriterTest')
   else:
     self._init(
         get_project_settings().getint('LOCAL_PAGE_WRITER_DATA_TIME_LIMIT', 86400),
         get_project_settings().getint('LOCAL_PAGE_WRITER_DATA_FLUSH_LIMIT', 20000),
         get_project_settings().get('LOCAL_PAGE_WRITER_DATA_DIR', '/letv/crawler_delta/')
         )
     self.set_name('PageLocalJsonWriter')
Example #5
 def __init__(self, spider):
   super(CDesktopWriter, self).__init__(spider)
   self.set_name('CDesktopWriter')
   self.data_reciver_ip = '10.180.92.206'
   self.data_port = 10086
   self.total_send_count = 0
   self.type_c = get_project_settings().get('CD_WRITE_TYPE', 'http')
   self.post_url = get_project_settings().get('CD_WRITE_POST_URL',
       'http://10.180.92.206:9998/bigdata/post/webpage')
   self.connection = None
   self.__create_data_pipe()
   self.retry_time_max = 10
Example #6
    def __prepare_seed_list(self, tournament, year, month):
        whoscored_feed_url = get_project_settings().get('WHOSCORED_FEED_URL')

        if tournament and year and month:
            return [whoscored_feed_url % (tournament, year, month.zfill(2))]

        tournaments = get_project_settings().get('TOURNAMENTS')
        years = get_project_settings().get('TOURNAMENT_YEARS')

        dates = [(years[0], month) for month in xrange(6, 12)]
        #dates.extend([(years[1], month) for month in xrange(1, 6)])

        return [whoscored_feed_url % (tournament, year, str(month).zfill(2)) for tournament in tournaments for (year, month) in dates]
Example #7
	def __init__(self):
		""" Initialise la connexion au serveur MySQL """
		settings = get_project_settings()
		self.conn = MySQLdb.connect(user=settings.get('DB_USER'), 
									passwd=settings.get('DB_PASSWD'), 
									db=settings.get('DB_DATABASE'), 
									host=settings.get('DB_HOST'), charset="utf8", use_unicode=True)
		self.cursor = self.conn.cursor()

		pdfPath = get_project_settings().get('PDF_DIR')
		self.pdfPath = pdfPath

		if not os.path.isdir(pdfPath): # check that the directory holding the PDFs exists
			os.makedirs(pdfPath) # create the directory
		self.fullRepositoryExist = False
Example #8
    def initStartUrls(self):
#         self.kkcardb = sqlite3.connect(self.dbName)
#         self.cursor  = self.kkcardb.cursor()
        # connect to MySQL
        settings = get_project_settings()        
        host   = settings.get('MYSQL_HOST')
        port   = settings.get('MYSQL_PORT')
        user   = settings.get('MYSQL_USER')
        passwd = settings.get('MYSQL_PASSWD')
        dbName = settings.get('MYSQL_DBNAME')

        dbTool = connMySQL(host, int(port), user, passwd, dbName)
        
        self.kkcardb = dbTool[0]
        self.cursor  = dbTool[1]

        # sqlite3 version (commented out)
#         seriesIdList = self.cursor.execute('select id from t_bx_car_series')
#         
#         for seriesId in seriesIdList:
#             start_url = 'http://www.16888.com/'+str(seriesId[0])
#             self.start_urls.append(start_url)
#             print('start_urls='+start_url)

        # MySQL version
        count = self.cursor.execute('select id from t_bx_car_series')
        print('count='+str(count))
        seriesList = self.cursor.fetchall()
        for seriesId in seriesList:
            start_url = 'http://www.16888.com/'+str(seriesId[0])
            self.start_urls.append(start_url)
            print('start_urls='+start_url)

        self.cursor.close()
        self.kkcardb.close()
Example #9
def settings():
    settings_patch = {
        'LAST_RUNS_PATH': '/tmp/last_runs/'
    }
    settings = get_project_settings()
    with patch.dict(settings, settings_patch):
        yield settings
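The fixture above patches LAST_RUNS_PATH into the live Scrapy settings and restores it when patch.dict exits (the listing has presumably dropped its @pytest.fixture decorator). A minimal sketch of a test consuming it, assuming pytest:

def test_last_runs_path_is_patched(settings):
    # the patched value is visible for the duration of the test
    assert settings['LAST_RUNS_PATH'] == '/tmp/last_runs/'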
Example #10
 def __init__(self):
     settings = get_project_settings()
     settings.set('LOG_ENABLED', False, priority='cmdline')
     #settings.overrides['LOG_ENABLED'] = False
     self.crawler = CrawlerProcess(settings)
     self.items = []
     SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)
Example #11
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #12
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website, validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #13
File: run.py  Project: wac81/experience
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #14
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
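Per the docstring, passing an empty task list runs every registered spider; a hedged usage sketch (specific task names would come from CRAWLER_TASK_MAPS, which is not shown here):

if __name__ == '__main__':
    # run all default crawler spiders; pass e.g. ['some_task'] to filter
    crawler_start('crawler', [])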
Example #15
    def run(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)

        process.crawl('stackoverflow')
        process.start()
Example #16
 def get(self):
     while True:
         process = CrawlerProcess(get_project_settings())
         process.crawl('iqiyi')
         process.start()
         time.sleep(3000)
     self.finish()
Example #17
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example #18
def startSpiderTest(group_type,spider_type,spider_group_name,spider_name):
    # call into the Scrapy core API
    settings = get_project_settings()
    # instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)

    # create a crawler; a single crawler process can run several crawls
    crawler = crawlerProcess.create_crawler(spider_name)

    # hook up the spider signals: when a spider emits a signal, the matching handler is invoked
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)

    # look up the spider class and build its arguments
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"

    spider = spiderConf[0](**spiderArgs)

    # attach the spider instance to the crawler
    crawler.configure()
    crawler.crawl(spider)

    # start the crawler
    crawlerProcess.start()
    crawlerProcess.stop()
Example #19
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example #20
def spiderCrawl(bandname):
   createLink(bandname)
   settings = get_project_settings()
   settings.set('USER_AGENT','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
   process = CrawlerProcess(settings)
   process.crawl(MySpider)
   process.start()
Example #21
 def _setup(self, project):
     spider = crawlspider.LinkSpider(project)
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.crawl(spider)
     self.add_crawler()
Example #22
def setup_crawler(id="550", publisher="rbd"):
    spider = DmmQuerySpider(id, publisher)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #23
 def __init__(self, spider):
     Process.__init__(self)
     settings = get_project_settings()
     self.crawler = Crawler(settings)
     self.crawler.configure()
     self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     self.spider = spider
Example #24
    def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True, crawlLinks=True,crawlContents=True, crawlFormData=True):
        def catch_item(sender, item, **kwargs):
            item['url'] = item['url'].replace('http://127.0.0.1:'+str(localPort)+extraPath, hiddenWebSite)
            print "[+]Processing URL %s ...  " %(item['url'])
            from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
            database = TortazoDatabase()
            database.initDatabaseDeepWebCrawlerPlugin()
            self.__processPage(item, database)

        # setup crawler
        dispatcher.connect(catch_item, signal=signals.item_passed)
        dispatcher.connect(reactor.stop, signal=signals.spider_closed)

        settings = get_project_settings()
        settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
        settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir+hiddenWebSite)

        crawler = Crawler(settings)
        crawler.configure()
        spider = HiddenSiteSpider("http://127.0.0.1:"+str(localPort)+extraPath, hiddenWebSite, self.extractorRules)
        spider.setImages(crawlImages)
        spider.setLinks(crawlLinks)
        spider.setContents(crawlContents)
        spider.setForms(crawlFormData)

        crawler.crawl(spider)
        print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
        crawler.start()
        reactor.run()
        print "[+] Crawler finished."
Example #25
  def __init__(self, spider):
    PageWriterBase.__init__(self, spider)
    if get_project_settings()['DEBUG_MODE']:
      self._init(300, 20000, '/tmp/crawler_baidu_hot/')
      self.set_name('RankingListWriterDebug')
    else:
      self._init(3600, 30000, '/letv/crawler_baidu_hot/')
      self.set_name('RankingListWriter')

    self._rank_type_map = {'实时热点': RankingListType.BaiduHotRealTime,
                           '今日热点': RankingListType.BaiduHotToday,
                           '七日热点': RankingListType.BaiduHot7Days,
                           '民生热点': RankingListType.BaiduHotLife,
                           '娱乐热点': RankingListType.BaiduHotPlay,
                           '体育热点': RankingListType.BaiduHotSports,
                           '百度电视剧榜': RankingListType.BaiduHotDrama,
                           '百度电影榜': RankingListType.BaiduHotMovie,
                           '百度动漫榜': RankingListType.BaiduHotComic,
                           '百度综艺榜': RankingListType.BaiduHotVariety
                          }
    self._ranking_lists = []
    self._is_wrote = []
    for i in range(len(self._rank_type_map.keys())):
      self._ranking_lists.append(RankingList())
      self._is_wrote.append(False)
Example #26
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #27
def load_site_conf(site):
    #pid_file = file("%s.pid" % site, "w+")
    #pid_file.write(os.getpid())
    #pid_file.close()
    Utils.settings = get_project_settings()
    sites_rule = {}
    conf_dict = xmltodict.parse(file(r"%s" % (site)).read())
    sites_rule = conf_dict['SiteRule']['Sites']['Site']
    if not isinstance(sites_rule, list):
        sites_rule = [sites_rule]
    Conf.conf_dict = conf_dict
    Utils.conf_dict = conf_dict
    Conf.sites_rule = sites_rule
    Conf.ua = Conf.conf_dict['SiteRule'].get('UserAgent', '')
    Utils.get_mongodb_client()
    db = Utils.get_db()
    res = list(db.select(Utils.settings['MYSQL_TASKSTATUS_TABLE'], what="id", where="uuid=$uuid", vars={"uuid":conf_dict['SiteRule'].get('Uuid', "")}))
    Conf.uuid = int(conf_dict['SiteRule'].get('Uuid', 0))
    Conf.task_id = int(res[0]['id']) if len(res) else int(conf_dict['SiteRule']['TaskId'])
    Conf.pid = int(conf_dict['SiteRule']['TaskId'])
    Conf.output_files = {
        ConfConstants.Output.XML:Conf.conf_dict['SiteRule']['XMLFileName'],
        ConfConstants.Output.JSON:Conf.conf_dict['SiteRule']['JsonFileName'],
        ConfConstants.Output.MongoDB:Conf.conf_dict['SiteRule']['TableName'],
        ConfConstants.Output.Hadoop:Conf.conf_dict['SiteRule']['Hadoop']
    }
    Utils.daemon = ServiceDaemon()
Example #28
def main():
    """Index alexa demographics
    """

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    settings = get_project_settings()
    settings.set('ITEM_PIPELINES',
                 {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
    settings.set('EXTENSIONS',
                 {'scrapy.telnet.TelnetConsole': None,})


    process = CrawlerProcess(settings)
    for website in session.query(WebsitesContent).all():
        demographic = list(session.query(Websites).filter_by(link=website.link))
        if len(demographic) == 0:
            url = website.link
            print website.link
            AlexaSpider.name = url
            process.crawl(AlexaSpider, url=url, db_session=session)
    process.start()
    process.stop()

    session.close()
Example #29
File: decc.py  Project: CharlesNie/DECC
def main(argv):

	try:
		opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
	except getopt.GetoptError:
		print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
			sys.exit()
		elif opt == '-c':
			# start crawling article here
			print "crawling"
			process = CrawlerProcess(get_project_settings())
			process.crawl(BBCArticleSpider)
			process.start()
		elif opt in  ('-t', '--title'):
			print "search by title"
			# start searching article by title
			results = BBCArticleItem.fetch_by_title(arg)
			for result in results:
				print result
		elif opt in ('-s', '--section'):
			print "search by section"
			# start searching article by section
			results = BBCArticleItem.fetch_by_section(arg)
			for result in results:
				print result
Example #30
    def fetch_data(self, mobile_number):
        request = scrapy.Request(
            url=crypt.get_posturl(),
            method='POST',
            body=crypt.get_poststr(mobile_number),
            headers={
                'X-CLIENT-PFM': '20',
                'X-CLIENT-VCODE': '81',
                'X-CLIENT-PID': '8888888',
                'Content-Type': 'application/json; charset=utf-8',
                'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 5.0.2; Redmi Note 2 MIUI/V7.5.5.0.LHMCNDE',
                'Accept-Encoding': 'gzip',
            }
        )
        request.meta['mobile'] = mobile_number
        request.meta['msk'] = crypt.sk
        request.meta['mtk'] = crypt.tk
        request.meta['muid'] = crypt.uid

        settings = get_project_settings()
        downloader = HTTP11DownloadHandler(settings)
        deferred = downloader.download_request(request)
        deferred.addCallback(self.parse_response, request)
        deferred.addErrback(self.parse_error)
        reactor.run()
Example #31
    def __init__(self, id_list, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.id_list = id_list

        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
Example #32
def run_crawl():
    process = CrawlerProcess(get_project_settings())
    process.crawl('quotes')
    process.start()
Example #33
def loop_crawl():
    runner = CrawlerRunner(get_project_settings())
    d = crawl(runner)
    d.addBoth(lambda _: crawl(runner))
    reactor.run()
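loop_crawl relies on a crawl() helper that is not defined in this example. A sketch of what it might look like, assuming a spider named 'quotes': it schedules one crawl and returns the Deferred so the next run can be chained in addBoth:

def crawl(runner):
    # returns a Deferred that fires when the spider finishes
    return runner.crawl('quotes')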
Example #34
def run_crawler(url):
    process = CrawlerProcess(get_project_settings())
    process.crawl(CzbooksSpider, url=url)
    process.start()
Example #35
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PoeApiSpider)
    process.start()
Example #36
class ZhiHuSpider(scrapy.Spider):

    name = "zhihu"
    start_urls = ['https://zhihu.com']
    allowed_domains = ['www.zhihu.com']

    setting = get_project_settings()
    headers = setting['DEFAULT_REQUEST_HEADERS']
    post_data = setting['POST_DATA']
    question_count = setting['QUESTION_COUNT']
    answer_count = setting['ANSWER_COUNT_PER_QUESTION']
    answer_offset = setting['ANSWER_OFFSET']

    # the captcha glyph positions are fixed
    capacha_index = [[12.95, 14.969999999999998], [36.1, 16.009999999999998],
                     [57.16, 24.44], [84.52, 19.17], [108.72, 28.64],
                     [132.95, 24.44], [151.89, 23.380000000000002]]

    # URL template for paging through home-feed questions
    next_page = 'https://www.zhihu.com/api/v3/feed/topstory?action_feed=True&limit=10&' \
                'session_token={0}&action=down&after_id={1}&desktop=true'
    session_token = ''

    # URL triggered by clicking "view more answers"
    more_answer_url = 'https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.i' \
                      's_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_actio' \
                      'n%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_ed' \
                      'it%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2' \
                      'Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Crevie' \
                      'w_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2' \
                      'Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.ur' \
                      'l%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.t' \
                      'opics&offset={1}&limit={2}&sort_by=default'

    def start_requests(self):

        yield scrapy.Request('https://www.zhihu.com/',
                             callback=self.login_zhihu)

    def login_zhihu(self, response):
        """ 获取xsrf及验证码图片 """
        xsrf = re.findall(r'name="_xsrf" value="(.*?)"/>', response.text)[0]
        self.headers['X-Xsrftoken'] = xsrf
        self.post_data['_xsrf'] = xsrf

        times = re.findall(
            r'<script type="text/json" class="json-inline" data-n'
            r'ame="ga_vars">{"user_created":0,"now":(\d+),', response.text)[0]
        captcha_url = 'https://www.zhihu.com/' + 'captcha.gif?r=' + times + '&type=login&lang=cn'

        yield scrapy.Request(captcha_url,
                             headers=self.headers,
                             meta={'post_data': self.post_data},
                             callback=self.veri_captcha)

    def veri_captcha(self, response):
        """ 输入验证码信息进行登录 """
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)

        print('只有一个倒立文字则第二个位置为0')
        loca1 = eval(input('input the loca 1:'))
        loca2 = eval(input('input the loca 2:'))
        captcha = self.location(int(loca1), int(loca2))

        self.post_data = response.meta.get('post_data', {})
        self.post_data['captcha'] = captcha
        post_url = 'https://www.zhihu.com/login/email'

        yield scrapy.FormRequest(post_url,
                                 formdata=self.post_data,
                                 headers=self.headers,
                                 callback=self.login_success)

    def location(self, a, b):
        """ 将输入的位置转换为相应信息 """
        if b != 0:
            captcha = "{\"img_size\":[200,44],\"input_points\":[%s,%s]}" % (
                str(self.capacha_index[a - 1]), str(self.capacha_index[b - 1]))
        else:
            captcha = "{\"img_size\":[200,44],\"input_points\":[%s]}" % str(
                self.capacha_index[a - 1])
        return captcha

    def login_success(self, response):

        if 'err' in response.text:
            print((response.text))
            print("error!!!!!!")
        else:
            print("successful!!!!!!")
            yield scrapy.Request('https://www.zhihu.com',
                                 headers=self.headers,
                                 dont_filter=True)

    def parse(self, response):
        """ 获取首页问题 """
        question_urls = re.findall(r'https://www.zhihu.com/question/(\d+)',
                                   response.text)

        # 翻页用到的session_token 和 authorization都可在首页源代码找到
        self.session_token = re.findall(r'session_token=([0-9,a-z]{32})',
                                        response.text)[0]
        auto = re.findall(r'carCompose&quot;:&quot;(.*?)&quot',
                          response.text)[0]
        self.headers['authorization'] = 'Bearer ' + auto

        # questions on the first page of the home feed
        for url in question_urls:
            question_detail = 'https://www.zhihu.com/question/' + url
            yield scrapy.Request(question_detail,
                                 headers=self.headers,
                                 callback=self.parse_question)

        # fetch up to the configured number of questions
        n = 10
        while n < self.question_count:
            yield scrapy.Request(self.next_page.format(self.session_token, n),
                                 headers=self.headers,
                                 callback=self.get_more_question)
            n += 10

    def parse_question(self, response):
        """ 解析问题详情及获取指定范围答案 """
        text = response.text
        item = ZhihuQuestionItem()

        item['name'] = re.findall(r'<meta itemprop="name" content="(.*?)"',
                                  text)[0]
        item['url'] = re.findall(r'<meta itemprop="url" content="(.*?)"',
                                 text)[0]
        item['keywords'] = re.findall(
            r'<meta itemprop="keywords" content="(.*?)"', text)[0]
        item['answer_count'] = re.findall(
            r'<meta itemprop="answerCount" content="(.*?)"', text)[0]
        item['comment_count'] = re.findall(
            r'<meta itemprop="commentCount" content="(.*?)"', text)[0]
        item['flower_count'] = re.findall(
            r'<meta itemprop="zhihu:followerCount" content="(.*?)"', text)[0]
        item['date_created'] = re.findall(
            r'<meta itemprop="dateCreated" content="(.*?)"', text)[0]

        count_answer = int(item['answer_count'])
        yield item

        question_id = int(
            re.match(r'https://www.zhihu.com/question/(\d+)',
                     response.url).group(1))

        # fetch the configured number of answers, starting at the configured offset
        if count_answer > self.answer_count:
            count_answer = self.answer_count
        n = self.answer_offset
        while n + 20 <= count_answer:
            yield scrapy.Request(self.more_answer_url.format(
                question_id, n, n + 20),
                                 headers=self.headers,
                                 callback=self.parse_answer)
            n += 20

    def get_more_question(self, response):
        """ 获取更多首页问题 """
        question_url = 'https://www.zhihu.com/question/{0}'
        questions = json.loads(response.text)

        for que in questions['data']:
            question_id = re.findall(r'(\d+)',
                                     que['target']['question']['url'])[0]
            yield scrapy.Request(question_url.format(question_id),
                                 headers=self.headers,
                                 callback=self.parse_question)

    def parse_answer(self, response):
        """ 解析获取到的指定范围答案 """
        answers = json.loads(response.text)

        for ans in answers['data']:
            item = ZhihuAnswerItem()
            item['question_id'] = re.match(
                r'http://www.zhihu.com/api/v4/questions/(\d+)',
                ans['question']['url']).group(1)
            item['author'] = ans['author']['name']
            item['ans_url'] = ans['url']
            item['comment_count'] = ans['comment_count']
            item['upvote_count'] = ans['voteup_count']
            item['excerpt'] = ans['excerpt']

            yield item
Example #37
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    process = CrawlerProcess(get_project_settings())
    crawler = PageCrawler()
    process.crawl(crawler)
    process.start()
Example #38
def spider_task():
    process = CrawlerProcess(get_project_settings())
    process.crawl(HotmovieSpider)
    process.start()
Example #39
def connect_db():
    s = get_project_settings()
    return create_engine(URL(**s['DATABASE']))
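URL(**s['DATABASE']) expands a dict into the sqlalchemy.engine.url.URL keyword arguments. A hypothetical DATABASE entry in settings.py that would work here:

DATABASE = {
    'drivername': 'postgresql',
    'host': 'localhost',
    'port': '5432',
    'username': 'scrapy',
    'password': 'secret',
    'database': 'items',
}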
Example #40
def run_crawler():
    process = CrawlerProcess(get_project_settings())

    process.crawl('amazon')
    process.start()
Example #41
    def __init__(self):

        self.engine = create_engine(
            get_project_settings().get("CONNECTION_STRING"))
        self.Session = sessionmaker(bind=self.engine)
Example #42
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(SolutionSpider)
    process.start()
Example #43
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import scrapy
# from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.pipelines.images import ImagesPipeline
# DropItem is used to discard an item from the pipeline
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import os
# os.rename(old_name, new_name)

# directory where downloaded images are stored
image_store = get_project_settings().get('IMAGES_STORE')


# image download pipeline
class MyImagesPipeline(ImagesPipeline):

    # def file_path(self, request, response=None, info=None):
    #     # may return a custom path for the image
    #     #return 'full/%s.jpg' % (image_guid)
    #     return 'full/%s.jpg' % (image_name)

    def get_media_requests(self, item, info):
        # step 1: read the image URLs to download from the item,
        # build a Request for each one and hand it to the scheduler
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)
Example #44
def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    return create_engine(get_project_settings().get("CONNECTION_STRING"))
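db_connect only needs a CONNECTION_STRING setting holding any SQLAlchemy database URL; a hypothetical settings.py entry:

CONNECTION_STRING = 'sqlite:///scrapy_items.db'
# or e.g. 'mysql+pymysql://user:password@localhost:3306/scrapy'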
Example #45
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())

        process.crawl(gocovid)
        process.start()
Example #46
 def from_crawler(cls, crawler):
     settings = get_project_settings()
     return cls(mongo_uri=crawler.settings.get('MONGODB_URI'),
                mongo_db=settings.get('MONGODB_DATABASE', 'items'))
Example #47
    def start_requests(self):
        settings = get_project_settings()

        for offset in range(2300):
            yield scrapy.Request(url='https://finance.yahoo.com/screener/unsaved/4b6788ec-db90-477a-a44a-09e5cd5b5027?count=1&offset=' + str(offset), callback=self.parse)
Example #48
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())

        process.crawl(ParliamentMemberSpider)
        process.start()
Example #49
def outletpccatalog():
    print("fent catàleg...")

    process = CrawlerProcess(get_project_settings())
    process.crawl(OutletpcCataleg)
    process.start()  # the script will block here until the crawling is finished
Example #50
def scrawl():
    process = CrawlerProcess(get_project_settings())
    process.crawl('posts-spider')
    process.start()
Example #51
def db_connect():
    return create_engine(get_project_settings().get("CONNECTION_STRING"))
Example #52
import os
import time
import re
from scrapy.utils.project import get_project_settings
from spiders.selecrawlers.baseselespider import BaseSeleSpider
from datetime import datetime
from selenium.webdriver.common.keys import Keys
from wangban_utils.single_mode import singleton
from spiders.basemodel import DIYBaseSpider
SETTINGS = get_project_settings()
JSONFILE = os.path.join(SETTINGS['BASE_JSONFILE_PATH'],'changxing.json')

@singleton
class ChangXingSeleSpider(BaseSeleSpider):
    name = 'changxing'
    def __init__(self):
        super().__init__()
        self.post_suf = '__page_{}'
        self.source_url ='http://ggzy.zjcx.gov.cn:8081/cxweb/'


    def get_totalpage(self,driver):
        # read the total page count; if it cannot be found, fall back to 1
        try:
            total_page = driver.find_element_by_xpath(self.xp.total_page).text
            total_page = int(total_page)
        except Exception as e:
            total_page = 1
            print('get total error',e)
            print(driver.current_url)
Example #53
class LikeSpider(scrapy.Spider):
    name = 'like'

    allowed_domains = ['yande.re']
    setting = get_project_settings()  # project settings, used to look up config values
    table = pymongo.MongoClient(setting['MONGO_URL'])[setting['MONGO_DB']][
        setting['MONGO_YANDE_POST_DB']]

    # rules = (
    #     Rule(LinkExtractor(allow=r'/post\?page=\d+&tags=vote%3A%3E%3D1%3Akamiyamashiki\+order%3Avote'), callback='parse_list', follow=True),
    # )
    # https://yande.re/post?page=1&tags=vote%3A%3E%3D1%3Akamiyamashiki+order%3Avote
    # /post?page=2&tags=vote%3A%3E%3D1%3Akamiyamashiki+order%3Avote

    def start_requests(self):
        yield scrapy.Request('https://yande.re/user/login',
                             callback=self.login)

    def login(self, response):
        form_data = {
            'user[name]': self.setting['YANDE_USERNAME'],
            'user[password]': self.setting['YANDE_PASSWORD']
        }
        yield FormRequest.from_response(response,
                                        formdata=form_data,
                                        callback=self.after_login)

    def after_login(self, response):
        yield scrapy.Request('https://yande.re/post?tags=vote:' +
                             self.setting['YANDE_VOTE'] + ':' +
                             self.setting['YANDE_USERNAME'] + ' order:vote',
                             callback=self.parse_list)

    def parse_list(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        print(response.url)
        print(response.body)
        for url_selector in response.xpath(
                '//li//div[@class="inner"]/a[@class="thumb"]/@href'):
            url = url_selector.get()
            pic_id = int(url.replace('/post/show/', ''))
            if not self.table.find_one({'pic_id': pic_id}):
                yield scrapy.Request('https://' + self.allowed_domains[0] +
                                     url,
                                     callback=self.parse_pic)
            else:
                print('##### image ' + str(pic_id) + ' already exists #####')
            # pass
        if response.xpath('//a[@class="next_page"]/@href').get():
            next_page = 'https://' + self.allowed_domains[0] + response.xpath(
                '//a[@class="next_page"]/@href').get()
            yield scrapy.Request(next_page, callback=self.parse_list)

    def parse_pic(self, response):
        print('start processing ' + response.url)
        pic = Pic()
        pic['unchanged_pic_url'] = response.xpath(
            '//a[@class="original-file-unchanged"]/@href').extract_first()
        pic['changed_pic_url'] = response.xpath(
            '//a[@class="original-file-changed"]/@href').extract_first()
        pic['pic_id'] = int(
            response.xpath(
                '//*[@id="stats"]/ul/li[1]/text()').extract_first().replace(
                    'Id: ', ''))
        pic['rating'] = response.xpath(
            '//li[contains(./text(), "Rating")]/text()').get().replace(
                'Rating: ', '')
        pic['size'] = response.xpath(
            '//*[@id="stats"]/ul/li[contains(./text(), "Size")]/text()').get(
            ).replace('Size: ', '')
        pic['source'] = response.xpath(
            '//*[@id="stats"]/ul/li[contains(./text(), "Source")]/a/@href'
        ).get()
        pic['date'] = datetime.datetime.now()
        # pic['tags'] = '■'.join(response.xpath('//ul[@id="tag-sidebar"]/li/a[contains(./@href, "/post?tags=") and not(@class)]/text()').extract())
        pic['tags'] = response.xpath(
            '//ul[@id="tag-sidebar"]/li/a[contains(./@href, "/post?tags=") and not(@class)]/text()'
        ).extract()
        parent = response.xpath(
            '//div/a[contains(text(), "parent post")]/@href').get()
        if parent:
            pic['parent_pic_id'] = int(parent.replace('/post/show/', ''))
        if response.xpath('//p/span').get():
            pic['pool'] = response.xpath(
                '//p/span/following-sibling::*/text()').get()
            pic['pool_id'] = int(
                response.xpath(
                    '//p/span/following-sibling::*/@href').get().replace(
                        '/pool/show/', ''))
            pic['pool_seq'] = response.xpath('//p/span/text()').get().replace(
                '#', '')
        yield pic
Example #54
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())

        process.crawl(nba.NbaSpider)
        process.start()
Example #55
 def process_request(self, request: Any, spider: scrapy.Spider) -> None:
     settings = get_project_settings()
     request.headers.setdefault('User-Agent',
                                random.choice(settings['USER_AGENTS']))
     return None
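The middleware above picks a random User-Agent from a USER_AGENTS setting. Hypothetical settings.py entries that would back it (the middleware path is an assumption about where the class lives):

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}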
Example #56
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.settings = get_project_settings()
     self._proxy_lis = self.proxies
Example #57
import logging
from scrapy.utils.project import get_project_settings

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

logger = logging.getLogger(__name__)

try:
    from scrapy_mysql import connection
except ImportError:
    print("Module is not installed: scrapy_mysql")
    raise

settings = get_project_settings()
url_server = connection.from_settings(settings)

HTTP_ERROR_CODE = -1
DNS_ERROR_CODE = -2
TIMEOUT_CODE = -3
UNKNOWN_HTTP_ERROR = -4


class ResponseErrMiddleware(object):
    def process_exception(self, request, exception, spider):
        url = request.url
        id = getattr(request, "id", None)
        if id is None:
            logger.error("No [id] in Request of url: [{}]".format(url))
Example #58
from medicalDataSpider.spiders.so39 import So39Spider
from medicalDataSpider.spiders.shiguanzhijia import ShiguanZhijiaSpider
from medicalDataSpider.spiders.jianshu import JianshuSpider
from medicalDataSpider.spiders.haodaifu import HaodaifuSpider
from medicalDataSpider.spiders.fh21 import Fh21Spider
from medicalDataSpider.spiders.bozhong import BozhongSpider
from medicalDataSpider.spiders.babytree import BabytreeSpider
from scrapy.crawler import CrawlerProcess
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from medicalDataSpider.spiders.ask120 import Ask120Spider
from scrapy.utils.log import configure_logging
from keywords import keywords
from short_keywords import short_keywords

configure_logging()
process = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    for keyword in keywords:
        yield process.crawl(HaodaifuSpider, keyword)

    reactor.stop()


crawl()
reactor.run()
Example #59
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('hupu')  # replace the spider name here with your own spider's name
    process.start()
Example #60
from scrapy.crawler import CrawlerProcess
from scrapy.utils import project

from myScrapy.spiders.ACG12 import ACG12
from myScrapy.spiders.QiWenBeauty import QiWenBeauty

spiders = {
    QiWenBeauty.name: True,
    ACG12.name: True
}

if __name__ == '__main__':
    setting = project.get_project_settings()
    process = CrawlerProcess(setting)

    for spider_name in process.spiders.list():
        if spiders.get(spider_name):
            print("启动爬虫 >>> %s <<<" % spider_name)
            process.crawl(spider_name)
    process.start()