def prepare(self, fromtext=False, start_idx=0, end_idx=100):
        if not fromtext:
            host = settings.get("REDIS_HOST", REDIS_HOST)
            port = settings.get("REDIS_PORT", REDIS_PORT)
            self.r = _default_redis(host, port)

            uids_set = UIDS_SET.format(spider=self.name)
            log.msg(format="Load uids from %(uids_set)s", level=log.WARNING, uids_set=uids_set)
            uids = self.r.smembers(uids_set)
            if not uids:
                log.msg(format="No uids loaded from %(uids_set)s", level=log.WARNING, uids_set=uids_set)

        else:
            uids = []
            fname = "uidlist_20140103.txt"
            log.msg(format="Load uids from %(uids_set)s", level=log.WARNING, uids_set=fname)
            if os.getcwd()[-8:] == "cron4win":
                f = open("../test/%s" % fname, "r")
            else:
                f = open("./test/%s" % fname, "r")
            count = 0
            for line in f.readlines():
                count += 1
                if count >= start_idx and count <= end_idx:
                    uids.append(int(line.strip().split(",")[0]))
                elif count < start_idx:
                    pass
                else:
                    break
            if not uids:
                log.msg(format="No uids loaded from %(uids_set)s", level=log.WARNING, uids_set=fname)
            f.close()

        return uids
Example #2
    def login(self, response):
        """Generate a login request."""
        # from scrapy.shell import inspect_response
        # inspect_response(response)
        hxs = HtmlXPathSelector(response)

        email = settings.get('FOLHA_USER')
        password = settings.get('FOLHA_PASS')

        challenge = hxs.select("//form[@name='login']/input[@name='challenge']/@value").extract()[0]
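        # The site's login form expects a challenge-response hash: md5(challenge + md5(password)).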
        password_challenge = hashlib.md5(challenge + hashlib.md5(password).hexdigest()).hexdigest()

        data = {'email': email,
                'password_challenge': password_challenge,
                'password': password,
                'challenge': challenge,
                r'auth.x': '1',
                r'auth.y': '1',
                'auth': 'Autenticar'
        }

        return [FormRequest.from_response(response,
                                          formname='login',
                                          formdata=data,
                                          callback=self.check_login_response)]
Example #3
 def __init__(self, *args, **kwargs):
     join_multivalued = settings.get('CSV_JOIN_MULTIVALUED', None)
     if join_multivalued:
         kwargs['join_multivalued'] = join_multivalued
     kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',')
     kwargs['fields_to_export'] = settings.get('EXPORT_FIELDS', None)
     super(CSVItemExporter, self).__init__(*args, **kwargs)
Example #4
    def open_spider(self, spider):
        res = super(MongoDBPipeline, self).open_spider(spider)
        if not res:
            return

        spider.username = self.username
        spider.password = self.password
        self.ensure_index(RackUsage)
        self.ensure_index(RackServers)
        old_invoices = [i for i in
                self.mongodb[RackServers._collection_name].find(
                    dict(
                        cloud_account_id=self.user_id,
                        invoice_id={"$exists": True, "$ne": ""},
                        enddate={"$exists": True}
                    ))]
        spider.old_invoices = [i['invoice_id'] for i in old_invoices]
        urls = settings.get('URLS')
        base_url = settings.get('BASE_URL')
        specific_urls = settings.get("SPECIFIC_URLS")
        if not self.base_url:
            self.base_url = base_url

        specific_url = specific_urls.get(self.base_url, {})
        for attr, url in urls.items():
            if attr in specific_url:
                url = specific_url[attr]
            setattr(spider, attr, urljoin(self.base_url, url))
Example #5
    def prepare(self, fromtext=False, start_idx=0, end_idx=100):
        if not fromtext:
            host = settings.get('REDIS_HOST', REDIS_HOST)
            port = settings.get('REDIS_PORT', REDIS_PORT)
            self.r = _default_redis(host, port)

            uids_set = UIDS_SET.format(spider=self.name)
            log.msg(format='Load uids from %(uids_set)s', level=log.WARNING, uids_set=uids_set)
            uids = self.r.smembers(uids_set)
            if not uids:
                log.msg(format='No uids loaded from %(uids_set)s', level=log.WARNING, uids_set=uids_set)

        else:
            uids = []
            fname = 'uid_about_marine'
            log.msg(format='Load uids from %(uids_set)s', level=log.WARNING, uids_set=fname)
            f = open('./source/%s' % fname)
            count = 0
            for line in f:
                count += 1
                if start_idx <= count <= end_idx:
                    uids.append(int(line.strip()))
                elif count > end_idx:
                    break
            if not uids:
                log.msg(format='No uids loaded from %(uids_set)s', level=log.WARNING, uids_set=fname)
            f.close()    
        
        return uids
Example #6
def connect():
    return psycopg2.connect(
        user=settings.get('PG_USER'),
        dbname=settings.get('PG_DBNAME'),
        host=settings.get('PG_HOST'),
        password=settings.get('PG_PASSWORD')
    )
Example #7
    def __init__(self, name=None, *args, **kwargs):
        '''Initialize weibospider

        Parameters
        ----------
        login            :    login status (True/False)
        start_urls       :    default urls to start crawling
        login_url        :    url to login

        redis_server     :    connected redis server
        ids_toCrawl_name :    name of toCrawl ids Queue
        ids_crawled_name :    name of crawled ids Queue
        ids_processing_name:  name of ids crawling now Queue
        ids_problem_name :    name of problem ids Queue
        '''
        super(WeiboSpider,self).__init__(name,*args,**kwargs)
        self.login        =   False
        self.start_urls   =   []
        self.login_url    =   self.weibo.login(self.username, self.password)

        self.redis_server =   redis.Redis(self.REDIS_HOST,self.REDIS_PORT)
        self.ids_toCrawl_name      =   settings.get('REDIS_TOCRAWL_QUEUE'   ,'user_ids_toCrawl'   )
        self.ids_crawled_name      =   settings.get('REDIS_CRAWLED_QUEUE'   ,'user_ids_crawled'   )
        self.ids_processing_name   =   settings.get('REDIS_PROCESSING_QUEUE','user_ids_processing')
        self.ids_problem_name      =   settings.get('REDIS_PROBLEM_QUEUE'   ,'user_ids_problem'   )

        if self.login_url:
            self.start_urls.append(self.login_url)
Example #8
    def __init__(self, emotion='smile face', start='20100101', end='20151221', interval=30, *args, **kwargs):
        super(FlickrSpider, self).__init__(*args, **kwargs)
        self.emotion = emotion
        self.start_date = start
        self.js_bin = settings.get('JS_BIN')
        self.js_wait = settings.get('JS_WAIT')
        start_date = int(time.mktime(time.strptime(start, '%Y%m%d')))
        end_date = int(time.mktime(time.strptime(end, '%Y%m%d')))

        ONEDAYSECONDS = 24 * 3600
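        # Split the [start, end] range into consecutive windows of `interval` days; one search URL is built per window.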
        num = (end_date - start_date) // (int(interval) * ONEDAYSECONDS)
        startlist = [(start_date + i * int(interval) * ONEDAYSECONDS, start_date +
                      (i + 1) * int(interval) * ONEDAYSECONDS) for i in xrange(num)]

        self.start_urls = [
            "http://www.flickr.com/search/?text={0}&view_all=1&media=photos&min_upload_date={1}&max_upload_date={2}"
            .format(emotion, *dateInterval) for dateInterval in startlist
        ]

        self.conn = sqlite3.connect('%s%s.db' % (emotion, start))
        self._create_table()
        service_args = ['--proxy=%s' %
                        settings.get('HTTP_PROXY'), '--proxy-type=http', '--load-images=false']

        self.driver = webdriver.PhantomJS(
            executable_path=self.js_bin, service_args=service_args)
        self.driver.set_window_size(1920, 1080)
        self.rules = [
            Rule(LinkExtractor(allow=['search/?text=%s&view_all=1' % emotion]), )]
Example #9
 def after_login(self, response):
     hxs = HtmlXPathSelector(response)
     alert = hxs.select('//ul[@class="message-alert"]').extract()
     if alert:
         print "Invalid login"
         raise CloseSpider(alert)
         return
     self.log.msg("Parsing current usage")
     meta = {}
     for region in settings.get("REGIONS"):
         item = HPCloudService(region=region, number=0)
         meta = {'item': item}
         headers = {
                 'X-Requested-With':'XMLHttpRequest',
                 'Accept':'application/json, text/javascript, */*; q=0.01',
                 }
         yield Request(self._FILES_URL.format(region=region),
                 headers=headers,
             callback=self.parse_files, meta=meta, errback=self.current_error)
         for zone in settings.get("ZONES"):
             item = HPCloudService(region=region, number=0)
             meta = {'item': item, 'zone': zone}
             yield Request(self._SERVERS_URL.format(region=region, zone=zone),
                 callback=self.parse_servers, meta=meta,
                 errback=self.current_error)
     yield Request(url=self._BILLS_URL, callback=self.parse_invoices)
Example #10
    def get_start_urls(self):
        """Extracts urls from a text file into the list of URLs to crawl"""
        if not settings.get('URLS'):
            raise ValueError('No text file. Use -s URLS=somefile.txt')

        with open(settings.get('URLS')) as data:
            return [line.rstrip('\r\n') for line in data]
Example #11
 def _loop(self, args, opts):
     if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy:
         heapy = guppy.hpy()
         
     task = Task().next(locked=0, completed=0)
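     # When a task is available, lock it and run it in a child scrapy-ctl.py process, capturing its output and timing.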
     if task:
         task.lock()
         cmd = ['python', os.path.join(os.getcwd(), 'scrapy-ctl.py'), 'run']
         cmd.append('--task-id=%s'%task.id)
         if opts.child_logfile:
             cmd.append('--logfile=%s'%opts.child_logfile)
             cmd.append('--child')
         task.start = datetime.now()
         process = subprocess.Popen(cmd, shell=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
         task.result, task.errors = process.communicate()
         task.finish = datetime.now()
         task.completed = 1
         task.save()
         timetext.LANG = 'en'
         total = task.finish - task.start
         log.msg('Finished: %s(%s) in %s'%(task.name, task.id, timetext.stringify(total)), level=log.INFO, domain=task.domain)
         if settings.get('MEMDEBUG_WITH_GUPPY', False) and guppy:
             log.msg(heapy.heap(), level=log.DEBUG)
             heapy.setref()
     else:
         time.sleep(30)
Example #12
 def from_settings(cls,settings):
     ret = {
         'mongo_server':settings.get('MONGODB_SERVER',MONGODB_SERVER),
         'mongo_port':settings.get('MONGODB_PORT',MONGODB_PORT),
         'mongo_db_name':settings.get('MONGODB_DB',MONGODB_DB),
     }
     return cls(**ret)
Example #13
 def load(self, task):
     '''
     Gets the task for the spider, loads the task's module code and applies code
     from configuration to the spider.
     '''
     self.task = task
     configuration = None
     if settings.get('TASKS'):
         available_tasks = settings.get('TASKS')
         if available_tasks.has_key(task.name):
             try:
                 configuration = load_object(available_tasks[task.name])
             except Exception, (ErrorMessage):
                 log.msg('Could not load configuration for task %s' % task.name, level=log.ERROR)
                 log.msg(ErrorMessage, level=log.DEBUG, domain='tripcentral.ca')
             configuration = configuration(task, self)
             if hasattr(configuration, 'start_urls'):
                 setattr(self, 'start_urls', configuration.start_urls)
             if hasattr(configuration, 'rules'):
                 setattr(self, 'rules', configuration.rules)
             if hasattr(configuration, 'parse_start_url'):
                 setattr(self, 'parse_start_url', configuration.parse_start_url)
             self.start_urls = self.get_start_urls()
             self._compile_rules()                
         else:
             log.msg('%s is not defined in settings.TASKS' % task.name, level=log.ERROR, domain=task.domain )
Example #14
 def __init__(self, *args, **kwargs):
    kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None
    kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8')
    delimiter = settings.get('CSV_DELIMITER', '|')
    kwargs['delimiter'] = delimiter
    kwargs['include_headers_line'] = False
    super(ProductCSVExporter, self).__init__(*args, **kwargs)
Example #15
 def __init__(self):
     self.queries = settings.get('GOOGLER_QUERIES')
     self.pages_to_get = settings.get('GOOGLER_PAGES_TO_GET_FROM_ENGINE')
     self.engines = googler.utils.loading.load_modules("googler.engines", settings.get('GOOGLER_USE_ENGINES'))
     self.load_crawling_config()
     self.forbid_regexps = map(re.compile, settings.get('GOOGLER_FORBID_URLS'))
     super(GooglerSpider, self).__init__()
Example #16
File: base.py  Project: epigos/news
    def parse_item_wrapper(self, response):
        """Wrapper for parse_item enabling exception notifications."""
        try:
            item = self.parse_item(response)
            return item
        except Exception, ex:
            url = None
            if response.url:
                url = response.url

            quarantine_database = get_quarantine_database()
            if quarantine_database and settings.get('QUARANTINE_MODE'):
                e = {
                    'exception': str(type(ex)),
                    'stacktrace': traceback.format_exc(),
                    'link': url
                }
                quarantine_database.save_exception(e)

            if settings.get('DEBUG'):
                self.log('Spider Exception trying to parse: ' + url)
                self.log(str(type(ex)) + " - " + traceback.format_exc())
            if not isinstance(ex, DropItem):
                self.log_exceptions += 1
            raise
Example #17
 def __init__(self, *args, **kwargs):
     delimiter = settings.get('CSV_DELIMITER', ',')
     kwargs['delimiter'] = delimiter
     fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
     if fields_to_export:
         kwargs['fields_to_export'] = fields_to_export
     super(YellowpagesItemExporter, self).__init__(*args, **kwargs)
Example #18
    def __init__(self, *args, **kwargs):
        delimiter = settings.get("CSV_DELIMITER", ",")
        kwargs["delimiter"] = delimiter

        fields_to_export = settings.get("FIELDS_TO_EXPORT", [])
        if fields_to_export:
            kwargs["fields_to_export"] = fields_to_export
        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
Example #19
 def __init__(self): # {{{
     db_url = settings.get("DB_URL")
     table_name = settings.get("DB_TABLE")
     if not db_url or not table_name:
         raise NotConfigured
     self.engine = create_engine(db_url, echo=False)
     self.metadata = MetaData(bind=self.engine)
     self.table = Table(table_name, self.metadata, autoload=True)
Example #20
    def __init__(self):
        connection = pymongo.MongoClient(
            settings.get('MONGODB_SERVER'),
            settings.get('MONGODB_PORT')
        )

        db = connection[settings.get('MONGODB_DB')]
        self.collection = db[settings.get('MONGODB_TABLE')]
Example #21
    def from_settings(cls, settings):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                                            port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)

        return cls(server, persist, timeout, retries)
Example #22
 def from_settings(cls, settings):
     cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
     cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
     cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
     cls.THUMBS = settings.get('IMAGES_THUMBS', {})
     cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
     cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
     store_uri = settings['IMAGES_STORE']
     return cls(store_uri)
Example #23
 def __init__(self):
     self.dbpool = adbapi.ConnectionPool(
         "MySQLdb",
         db=settings.get("DATABASE_NAME"),
         user=settings.get("DATABASE_USER"),
         passwd=settings.get("DATABASE_PASSWORD"),
         cursorclass=MySQLdb.cursors.DictCursor,
         charset="utf8",
         use_unicode=True,
     )
Example #24
def push_data(items):
    data = {
        'items': json.dumps(items),
        }
    encoded_data = urllib.urlencode(data)
    remote_server = '127.0.0.1:8000'
    if settings.get('REMOTE_SERVER'):
        remote_server = settings.get('REMOTE_SERVER')
    text = urllib2.urlopen('http://%s/robot/push' % remote_server, encoded_data).read()
    print text
Example #25
 def login(self, resp):
     user = settings.get('VIETNAMWORK_USERNAME')
     password = settings.get('VIETNAMWORK_PASSWORD')
     return FormRequest.from_response(resp,
                                      method='POST',
                                      formdata={'form[username]': user,
                                                'form[password]': password},
                                      callback=self.check_login,
                                      dont_filter=True
                                      )
Example #26
 def __init__(self, id='keywordSpider'):
     self.rules = rules.rules
     self.seeds = json.JSONDecoder('utf-8').decode(''.join(open(settings.get('SEEDS')).readlines()))
     self.id = id
     self.start_urls = []
     self.ts = datetime.now()
     self.domain = settings.get('DOMAIN')
     self.seed = self.seeds.get(self.domain)
     self.rule = self.rules.get(self.domain)
     self.getQueryWord()
Example #27
File: index.py  Project: Syhen/heretofore
 def get_books(self):
     mongo_uri = settings.get("MONGO_URI")
     db_name = settings.get("DB_NAME")
     auth = settings.get("AUTH")
     client = pymongo.MongoClient(mongo_uri)
     db = client[db_name]
     if auth:
         db.authenticate(**auth)
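     # Collect the ids of books already stored in book_index for this source.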
     books = set(i['book_id'] for i in db['book_index'].find({'source_id': 21}, {'book_id': 1}))
     self.books = books
Example #28
 def __init__(self):
     dbargs = settings.get('DB_CONNECT')
     db_server = settings.get('DB_SERVER')
     dbpool = adbapi.ConnectionPool(db_server,**dbargs)
     self.dbpool = dbpool
     # update the lists of already-seen ids
     d = self.dbpool.runInteraction(self.update_feed_seen_ids)
     d.addErrback(self._database_error)
     u = self.dbpool.runInteraction(self.update_user_seen_ids)
     u.addErrback(self._database_error)
Example #29
 def process_request(self, request, spider):
     hostname = urlparse(request.url).hostname
     solr = pysolr.Solr(settings.get('SOLR_CONNECTION'), timeout=10)
     query = 'domain:*' + hostname.split(".")[-2] + '.onion*'
     if solr.search(query).hits > settings.get('MAX_PER_DOMAIN'):
         # Do not execute this request
         request.meta['proxy'] = ""
         msg = "Ignoring request {}, More than 1000 sites crawled from this domain.".format(request.url)
         log.msg(msg, level=log.INFO)
         raise IgnoreRequest()
Example #30
File: index.py  Project: Syhen/heretofore
 def __init__(self, **kwargs):
     super(ChuangshiIndexSpider, self).__init__(**kwargs)
     mongo_uri = settings.get("MONGO_URI")
     db_name = settings.get("DB_NAME")
     auth = settings.get("AUTH")
     client = pymongo.MongoClient(mongo_uri)
     db = client[db_name]
     if auth:
         db.authenticate(**auth)
     books = set(i['book_id'] for i in db['book_index'].find({'source_id': 8}, {'book_id': 1}))
     self.books = books
Example #31
 def process_request(self, request, spider):
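     # Route roughly half of the requests (random draw above 500) through the configured HTTP proxy.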
     addproxy = random.randrange(0, 1000)
     if addproxy > 500:
         request.meta['proxy'] = settings.get('HTTP_PROXY')
Example #32
 def open_spider(self, spider):
     file_path = settings.get("FILE_PATH")
     self.file = open(file_path, 'w', encoding='utf-8')
Example #33
 def process_request(self, request, spider):
     ua = random.choice(settings.get('USER_AGENT_LIST'))
     if ua:
         request.headers.setdefault('User-Agent', ua)
Example #34
 def __init__(self, *args, **kwargs):
     export_fields = settings.get('FIELDS_TO_EXPORT', [])
     if export_fields:
         kwargs['fields_to_export'] = export_fields
     super(TopVNCsvItemExporter, self).__init__(*args, **kwargs)
Example #35
class ZhipinSpider(scrapy.Spider):

    name = "boss"
    allowed_domains = ["www.zhipin.com"]
    current_page = 1  # starting page number
    start_urls = [
        "https://www.zhipin.com/mobile/jobs.json?city=" +
        settings.get("BOSS_CITY_CODE") + "&query=" + settings.get("LANGUAGE"),
    ]
    custom_settings = {
        "ITEM_PIPELINES": {
            'tutorial.pipelines.ZhipinPipeline': 300,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'tutorial.middlewares.ZhipinMiddleware': 299,
            #   'tutorial.middlewares.ProxyMiddleware':301
        },
        "DEFAULT_REQUEST_HEADERS": {
            'Accept':
            'application/json',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Linux; Android 9.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Mobile Safari/537.36',
            'Referer':
            'https://www.zhipin.com/',
            'X-Requested-With':
            "XMLHttpRequest",
            "cookie":
            "lastCity=101020100; JSESSIONID="
            "; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1532401467,1532435274,1532511047,1532534098; __c=1532534098; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101020100-p100103%2F; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1532581213; __a=4090516.1532500938.1532516360.1532534098.11.3.7.11"
        }
    }

    def parse(self, response):
        js = json.loads(response.body)
        html = js['html']
        q = Selector(text=html)
        items = q.css('.item')
        host = 'https://www.zhipin.com'
        x = 1
        redis_host = settings.get('REDIS_HOST')
        redis_port = settings.get('REDIS_PORT')
        # initialize the Redis connection pool
        pool = redis.ConnectionPool(host=redis_host,
                                    port=redis_port,
                                    decode_responses=True)
        r = redis.Redis(connection_pool=pool)
        setkey = settings.get('REDIS_POSITION_KEY')
        for item in items:
            url = host + item.css('a::attr(href)').extract_first()
            position_name = item.css('h4::text').extract_first()  # position name
            salary = item.css('.salary::text').extract_first() or ''  # salary
            work_year = item.css(
                '.msg em:nth-child(2)::text').extract_first() or '不限'  # years of experience
            educational = item.css(
                '.msg em:nth-child(3)::text').extract_first()  # education level
            meta = {
                "position_name": position_name,
                "salary": salary,
                "work_year": work_year,
                "educational": educational
            }
            sleep_seconds = int(settings.get('SLEEP_SECONDS'))
            time.sleep(int(random.uniform(sleep_seconds, sleep_seconds + 20)))

            position_id = url.split("/")[-1].split('.')[0]
            print(position_id)
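            # sadd returns 1 only when the id was not already in the set, so previously crawled positions are skipped.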
            if (r.sadd(setkey, position_id)) == 1:
                yield Request(url, callback=self.parse_item, meta=meta)
        max_page = settings.get('MAX_PAGE')
        if self.current_page < max_page:
            self.current_page += 1
            api_url = "https://www.zhipin.com/mobile/jobs.json?city=" + settings.get(
                "BOSS_CITY_CODE") + "&query=" + settings.get(
                    "LANGUAGE") + "&page=" + str(self.current_page)
            time.sleep(int(random.uniform(sleep_seconds, sleep_seconds + 20)))
            yield Request(api_url, callback=self.parse)
        pass

    def parse_item(self, response):
        item = TutorialItem()
        q = response.css
        item['address'] = q('.location-address::text').extract_first()
        item['create_time'] = q('.job-tags .time::text').extract_first()
        item['body'] = q('.text').xpath('string(.)').extract_first()
        item['company_name'] = q('.business-info h4::text').extract_first()
        item['postion_id'] = response.url.split("/")[-1].split('.')[0]
        item = dict(item, **response.meta)
        yield item
Example #36
class A91Spider(scrapy.Spider):
    name = '91'
    allowed_domains = ['www.91porn.com']
    # start_urls = ['https://www.google.com/']
    start_urls = ['http://www.91porn.com/v.php?next=watch&page=2859']

    proxies_ = settings.get('PROXIES')
    max_page = 4698
    cookies = settings.get("COOKIES")

    #
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse,
                                 meta={'proxy': random.choice(self.proxies_)}, cookies=self.cookies)

    def parse(self, response):
        # print(response.text)
        doc = PyQuery(response.text)
        rows = doc('.videos-text-align').items()
        for row in rows:
            try:
                # print(row.text())

                split = row.text().strip()
                item = a91Item()
                r1 = split.split("积分:")
                item['score'] = r1[1]
                r2 = r1[0].split("留言:")
                item['msg'] = r2[1]
                r3 = r2[0].split("收藏:")
                item['favorite'] = r3[1]
                r4 = r3[0].split("查看:")
                item["views"] = r4[1]
                r5 = r4[0].split("作者:")
                item['author'] = r5[1]
                r6 = r5[0].split("添加时间:")
                item['add_time'] = r6[1].strip()
                r7 = r6[0].split(" ")
                if r6[0].lower().startswith("hd"):
                    item['time'] = r6[0][3:8]
                    item['title'] = r6[0][8:]
                else:
                    item['time'] = r6[0][0:5]
                    item['title'] = r6[0][5:]
                if utils.time_cmp(item['time'], min_time) < 0:
                    continue
                img = row.find(".img-responsive")
                if img is not None:
                    item['img'] = img.attr('src')
                else:
                    item['img'] = None
                href = doc('.videos-text-align a').eq(0).attr("href")
                item['video_url'] = self.get_video_url(href)
                item['cell_url'] = href
                yield item
            except Exception as e:
                text = doc('span.pagingnav').text()
                error_list.append(text)
                print("parse" + e.__str__())
        navs = doc(".pagingnav a")
        navs_eq = navs.eq(navs.length - 1)
        if navs_eq.text() == "»" and navs_eq.attr("href") is not None:
            print(prefix + navs_eq.attr("href"))
            yield scrapy.Request(prefix + navs_eq.attr("href"), callback=self.parse,
                                 meta={'proxy': random.choice(self.proxies_)}, cookies=self.cookies)

    def get_video_url(self, href):
        mget = utils.mget(href)
        if mget is None:
            return ""
        doc = PyQuery(mget.text)
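        # The real video URL is obfuscated: pull the two arguments of the inline strencode(...) call out of the player script.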
        split = doc('#player_one script').eq(0).text().replace("\"", "").split("(")[2].split(",")
        with open("D:\develop\Python\scrapy\demo1\demo1\spiders\md5.js", "r") as f:
            data_func = f.read()  # read the JS file
        tk = execjs.compile(data_func)  # compile the JS code
        a = "NS0tQCoqBCQKMg0AWjwoUwFaEGcTPzUBIRNBPCcAUi14AmRpIgwfS3QjETYLDVw7BigDKARRIQgLLXJ3N2x6MjALMEIaZR4NOy1nLhgEO3AqYiExCBNzHy0Obj8/A389KnA+R1AcORw8GihrIggYAiswMk00CSoFDX5hUhUjB3dydQ8BFX8UIzsdaUIrJgAd"
        b = "eec6NrNNPaOz9QejKhxWwwt7mjyDhT5X5h1Xnfx28IzNGteOelRRH+lqFG7Fz/OFSOamyVO4nh1lV5KCd7UzlF5fxcWneh5syBp44ecplNmZlbM2dtQ4zMokD63gvdRN8FUqO8BUw/X5"
        '''strencode()'''
        tk = tk.call('strencode', split[0], split[1])  # call the strencode function defined in the JS file with the extracted arguments
        # tk = tk.call('strencode', a, b)  # alternative call using the sample arguments a and b above
        tk = str(tk).split('src=\'')[1].split("'")[0]
        # print('tk', tk)
        return tk
Example #37
 def closed(self, reason):
     # Write to TXT File
     with open(settings.get('STOCKLIST_FILE'), 'w') as txtfile:
         for row in self.stocklist_mainboard + self.stocklist_gem:
             txtfile.write(row[0] + ',' + row[1] + '\n')
Example #38
 def __init__(self):
     self.db = sa.create_engine(settings.get('MYSQLDB'), encoding='utf-8')
     self.conn = self.db.raw_connection()
     self.cursor = self.conn.cursor()
     # self.table = self.table[spider.name]
     self.count = 0
Example #39
class AppWashingtonPostSpider(scrapy.Spider):
    name = 'app.Washington.post'
    limittime = settings.get('CRAWL_START_DATE')

    entry_point = {
        'world': 'world/',
        'business': 'business/',
        'national': 'national/',
        'politics': 'politics/',
    }

    headers = {
        'authority': "www.washingtonpost.com",
        'cache-control': "max-age=0,no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
    }

    # querystring = {"id": "fB7vVe1tAPpwur",
    #                "contentConfig": '{"path":"/world/?query=/WashingtonPost/Production/Digital/Queries/site-service/world/global-wire-feed&limit=15&offset=51"}',
    #                "uri": "/pb/world/",
    #                "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"}
    #
    # querystring = {"id": "f0TOnrBpFnCqur",
    #                "contentConfig": '{"path":"/national/?query=/WashingtonPost/Production/Digital/Pages-Tablet/business/_module-content/refresh-query-ipadbiz&limit=15&offset=20"}',
    #                "uri": "/pb/business/",
    #                "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"}
    #
    # querystring = {"id": "fnFGqW1K2HCqur",
    #                "contentConfig": '{"path":"/national/?query=/WashingtonPost/Production/Digital/Pages-Tablet/national/_module-content/refresh-query-ipadnational&limit=15&offset=20"}',
    #                "uri": "/pb/national/",
    #                "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"}
    #
    # querystring = {"id": "fzUfk61n8wBqur",
    #                "contentConfig": '{"path":"/politics/?limit=15&offset=20"}',
    #                "uri": "/pb/politics/",
    #                "service": "com.washingtonpost.webapps.pagebuilder.services.StoryAdapterService"}

    def start_requests(self):
        for key in self.entry_point.keys():
            yield Request(url='https://www.washingtonpost.com/{}'.format(
                self.entry_point[key]),
                          method='GET',
                          callback=self.parse,
                          headers=self.headers,
                          dont_filter=True)

    def parse(self, response):
        links = response.css('.story-headline a::attr(href)').extract()
        for link in set(links):
            yield Request(url=link,
                          method='GET',
                          callback=self.content_parse,
                          headers=self.headers)

    def content_parse(self, response):
        date = response.css('.author-timestamp::attr(content)').extract_first()
        date = helper.list2str(
            re.findall('(\d{4}-\d{2}-\d{2}|\d{2}:\d{2})', date))
        if date == None or len(date) == 0: return
        try:
            if helper.compare_time(date, self.limittime) < 0: return
        except:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        id = re.findall('(\d{2,4}/).*', response.url)
        pipleitem['id'] = id[0] if len(id) > 0 else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('.title::text').extract_first()
        pipleitem['source'] = 'WashingtonPost'
        pipleitem['content'] = helper.list2str(
            response.css('.paywall p').xpath('string(.)').extract())
        pipleitem['editor'] = response.css('.author::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.paywall img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.paywall video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        url = re.findall('"@id":(\S*)', response.text)[0]
        rs = requests.get(url='{asset(url:' + url + '){totalCommentCount}}',
                          headers=self.headers).text
        pipleitem['comment'] = re.findall(
            '\d*', rs)[0] if len(re.findall('\d*', rs)) > 0 else None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #40
 def __init__(self):
     log.init_log(settings.get('LOG_DIR'))
     logging.info("spider start......")
     print "spider start......"
     logging.info("fafafa")
Example #41
 def __init__(self):
     log.init_log(settings.get('LOG_DIR'))
Example #42
 def __set_page_range(self):
     self.__range_list['start'] = settings.get('WEIBO_INFO_START_PAGE')
     self.__range_list['end'] = settings.get('WEIBO_INFO_END_PAGE')
Example #43
    def from_settings(cls, settings):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex)
Example #44
 def process_request(self, request, spider):
     if 'proxy' not in request.meta:
         proxy = getattr(spider, 'proxy', settings.get('PROXY'))
         if proxy:
             request.meta['proxy'] = proxy
Example #45
 def process_request(self, request, spider):
     request.meta['proxy'] = settings.get('HTTP_PROXY')
     logger.debug('using proxy %s' % request.meta['proxy'] ) 
Example #46
 def __init__(self):
     self.redis_db = redis.Redis(host=settings.get('REDIS_HOST'),
                                 port=settings.get('REDIS_PORT'),
                                 db=1,
                                 password=settings.get('REDIS_PASSWD'))
     self.url_uuid = "news_uuid"
Example #47
class MySpider(RedisSpider):
    name = 'caijing_yicai_Agu'
    allowed_domains = ['www.yicai.com']
    ori_path = settings.get('ORI_PATH')
    encoding = "utf-8"
    start_urls = [
        "https://www.yicai.com/news/gushi/",
    ]
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 headers=self.headers,
                                 dont_filter=True)

    def parse(self, response):
        start_url = response.url
        try:
            data = htmlparser.Parser(response.body.decode(self.encoding))
        except Exception as e:
            print('response failed %s' % e)
            return
        org_list = data.xpathall('''//div[@id="newslist"]/a''')
        # for org in org_list[:5]:
        for org in org_list:
            if org:
                title = org.xpath('''//h2/text()''').text().strip()
                ctime = org.xpath('''//div[@class="author"]/span''').regex(
                    '(\d+-\d+-\d+ \d+:\d+)').text().strip()
                org_url = org.xpath('''//@href''').text().strip()
                if title:
                    url = urljoin(start_url, org_url)
                    print(url)
                    ctime = local_timestamp(ctime)
                    item = {'ctime': ctime, 'title': title}
                    print(item)
                    yield scrapy.Request(url,
                                         callback=self.detail_parse,
                                         meta={'item': item},
                                         headers=self.headers,
                                         dont_filter=True)

    def detail_parse(self, response):
        item = response.meta['item']
        try:
            data = htmlparser.Parser(response.body.decode(self.encoding))
        except Exception as e:
            print('second response failed %s' % e)
            return
        url = response.url
        contents = []  # all of the article's text content
        content_list = data.xpathall('''//div[@class="m-txt"]/p''')
        for con in content_list:
            con = con.text().strip()
            if con:
                contents.append(con)
        content_x = data.xpath('''//div[@class="m-txt"]''').data
        content_xml = content_x
        label = {}
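        # Map each inline image to an '#imageN#' placeholder key in label, storing its url and file name.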
        img_list = data.xpathall('''//div[@class="m-txt"]//img''')
        if img_list:
            for count, image in enumerate(img_list):
                image_dict = {}
                image_url = image.xpath('//@src').text().strip()
                if image_url:
                    image_url = urljoin(url, image_url)
                    node = '#image{}#'.format(count)
                    file_name = image_url.split('/')[-1]
                    image_dict['url'] = image_url
                    image_dict['name'] = ''
                    image_dict['file_name'] = file_name
                    label[node] = image_dict

        table_list = data.xpathall('''//div[@class="m-txt"]//table''')
        if table_list:
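            # Replace each table in the body with a '#tableN#' placeholder paragraph and keep the raw table XML in label.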
            for count, table in enumerate(table_list):
                table_dict = {}
                node = "#table{}#".format(count)
                table_sele = table.data
                table_dict['table_xml'] = table_sele
                node_p = "<p>" + node + "</p>"
                content_x = content_x.replace(table_sele, node_p)
                label[node] = table_dict
        xml = htmlparser.Parser(content_x)
        web_contents = []  # content shown directly on the web (tables replaced with placeholder nodes)
        content_list = xml.xpathall('''//p''')
        for con in content_list:
            con = con.text().strip()
            if con:
                web_contents.append(con)
        breadcrumb = ["首页", "新闻", "A股"]
        article_info = {}
        channel = 'A股'
        accessory = []  # attachments
        # all_acc = data.xpathall('''//div[@class="ewb-info-con"]//a''')
        # if all_acc:
        #     for acc in all_acc:
        #         temp = {}
        #         acc_url = acc.xpath('//@href').text().strip()
        #         if acc_url and '@' not in acc_url:
        #             acc_url = urljoin(url, acc_url)
        #             name = acc.text().strip()
        #             file_name = acc_url.split('/')[-1].split('=')[-1]
        #             temp['url'] = acc_url
        #             temp['name'] = name
        #             temp['file_name'] = file_name
        #             dir_path = os.path.join(self.ori_path, self.dir_name)
        #             if not os.path.isdir(dir_path):
        #                 os.makedirs(dir_path)
        #             path = os.path.join(dir_path, file_name)
        #             dow_img_acc(path, acc_url)
        #             # file_content = parse_main(path)
        #             temp['file_content'] = '' # file_content
        #             accessory.append(temp)
        gtime = int(time.time())
        main_business = ''
        source = data.xpath(
            '''//div[@class="title f-pr"]/p/span/text()''').text().strip()
        webname = '第一财经'
        domain = self.allowed_domains[0]
        uid = add_uuid(url)
        item["collection_name"] = "news_finance_yicai_raw"  # 集合名
        item["url"] = url  # 链接
        item["uid"] = uid  # 去重id
        item["contents"] = contents  # 数据处理的内容
        item["web_contents"] = web_contents  # 前端使用的内容
        item["article_info"] = article_info  # 文章的相关信息
        item["label"] = label  # 图片、表格
        item["accessory"] = accessory  # 附件
        item["gtime"] = gtime  # 爬虫时间
        item['breadcrumb'] = breadcrumb  # 导航
        item['channel'] = channel  # 频道
        item["spider_name"] = self.name  # 爬虫名
        item["webname"] = webname  # 网站名
        item["domain"] = domain  # 域名
        item["source"] = source  # 来源
        item["main_business"] = main_business  # 相关行业
        item['path'] = ''  # 附件路径
        yield item
Example #48
 def from_crawler(cls, crawler):
     return cls(mongo_uri=crawler.settings.get('MONGODB_URI'),
                mongo_db=settings.get('MONGODB_DATABASE', 'items'))
Example #49
    def process_request(self, request, spider):

        request.meta['proxy'] = settings.get('HTTP_PROXY')
Example #50
class OnionSpider(CrawlSpider):
    name = "OnionSpider"
    ALLOWED_DOMAINS = settings.get('ALLOWED_DOMAINS')

    if ALLOWED_DOMAINS and os.path.isfile(ALLOWED_DOMAINS):
        # Read a list of URLs from file
        # Create the target file list
        with open(ALLOWED_DOMAINS) as f:
            allowed_domains = f.read().splitlines()  # Make it to Python list
            allowed_domains = filter(None,
                                     allowed_domains)  # Remove empty strings
    else:
        allowed_domains = ["onion"]

    TARGET_SITES = settings.get('TARGET_SITES')

    if TARGET_SITES and os.path.isfile(TARGET_SITES):
        # Read a list of URLs from file
        # Create the target file list
        with open(TARGET_SITES) as f:
            start_urls = f.read().splitlines()  # Make it to Python list
            start_urls = filter(None, start_urls)  # Remove empty strings
    else:
        start_urls = [
            'https://ahmia.fi/address/',
        ]

    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True), )

    def parse_item(self, response):
        hxs = Selector(response)
        item = CrawledWebsiteItem()
        # Also the header
        item['header'] = response.headers
        item['url'] = response.url
        # Add the domain
        domain = urlparse(item['url']).hostname
        item['domain'] = domain
        title_list = hxs.xpath('//title/text()').extract()
        h1_list = hxs.xpath("//h1/text()").extract()
        item['h1'] = " ".join(h1_list)
        h2_list = hxs.xpath("//h2/text()").extract()
        item['h2'] = " ".join(h2_list)
        title = ' '.join(title_list)
        item['title'] = title
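        # Decode the raw body, convert the HTML to plain text and build a bounded word list for the item's text field.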
        encoding = self.detect_encoding(response)
        decoded_html = response.body.decode(encoding, 'ignore')
        html_text = self.html2string(decoded_html)
        words = self.extract_words(html_text)
        item['text'] = title + " " + " ".join(words)
        # For each link on this page
        item['links'] = []
        links = hxs.xpath('//a')
        for link in links:
            link_obj = {}
            # Extract the link's URL
            link_str = " ".join(link.xpath('@href').extract())
            link_obj['link'] = link_str.replace("\n", "")
            # Extract the links value
            link_name_str = " ".join(link.xpath('text()').extract())
            link_name_str = link_name_str.replace("\n", "")
            link_name_str = link_name_str.lstrip()
            link_name_str = link_name_str.rstrip()
            link_obj['link_name'] = link_name_str
            item['links'].append(link_obj)
        return item

    def detect_encoding(self, response):
        return response.headers.encoding or "utf-8"

    def html2string(self, decoded_html):
        """HTML 2 string converter. Returns a string."""
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        string = converter.handle(decoded_html)
        return string

    def extract_words(self, html_string):
        """Create a word list."""
        string_list = re.split(r' |\n|#|\*', html_string)
        # Cut a word list that is larger than 10000 words
        if len(string_list) > 10000:
            string_list = string_list[0:10000]
        words = []
        for word in string_list:
            # Word must be longer than 0 letter
            # And shorter than 45
            # The longest word in a major English dictionary is
            # Pneumonoultramicroscopicsilicovolcanoconiosis (45 letters)
            if len(word) > 0 and len(word) <= 45:
                words.append(word)
        return words
Example #51
 def open_spider(self, spider):
     db_config = settings.get("MONGODB_CONFIG")
     self.client = MongoClient(db_config["url"])
     db = self.client[db_config["db"]]
     self.collection = db.line
Example #52
 def __init__(self):
     '''
     Initialize.
     '''
     self.dbItem = settings.get('DB_ITEM')
Example #53
class NewsYorkbbsCaSpider(scrapy.Spider):
    name = 'news.yorkbbs.ca'
    limittime = settings.get('CRAWL_START_DATE')

    entry_point = {
        '综合': ['https://news.yorkbbs.ca/api/getlist', 'world'],
        '本地': ['https://news.yorkbbs.ca/api/getlist', 'local'],
        '专题': ['https://news.yorkbbs.ca/api/getMore/topics', 'breakingnews'],
    }

    param = {'type': None, 'pageIndex': '1', 'pageSize': '20'}

    commentpar = {
        'articleId': None,
        'origin': 'news',
        'device': 'pc',
        'pageIndex': '1',
        'pageSize': '20'
    }

    headers = {
        'Accept':
        '*/*',
        'Content-Type':
        'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin':
        'http://news.yorkbbs.ca',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    commentheaders = {
        'Accept':
        '*/*',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8',
        'Origin':
        'http://news.yorkbbs.ca',
        'Referer':
        None,
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def start_requests(self):
        for key in self.entry_point.keys():
            self.param['type'] = self.entry_point[key][1]
            yield FormRequest(url=self.entry_point[key][0],
                              formdata=self.param,
                              callback=self.parse,
                              headers=self.headers,
                              dont_filter=True,
                              meta={'type': key})

    def parse(self, response):
        jsbd = json.loads(response.text)
        for item in jsbd['result']:
            if 'contentid' not in item.keys() or len(str(
                    item['contentid'])) == 0:
                continue
            if response.meta['type'] == '专题':
                yield Request(
                    url='https://news.yorkbbs.ca/breakingnews/{}'.format(
                        item['contentid']),
                    method='GET',
                    callback=self.zhuanti_parse,
                    headers=self.headers,
                    dont_filter=True)
            else:
                yield Request(url='https://news.yorkbbs.ca/{}'.format(
                    item['contentid']),
                              method='GET',
                              callback=self.content_parse,
                              headers=self.headers,
                              meta={'id': item['contentid']},
                              dont_filter=True)

    def zhuanti_parse(self, response):
        links = response.css('.g-burst-left li .ig a::attr(href)').extract()
        for li in links:
            if re.search('news.yorkbbs.ca', li) == None: continue
            id = re.findall('\d+', li)[0]
            yield Request(url='https://news.yorkbbs.ca/{}'.format(id),
                          method='GET',
                          callback=self.content_parse,
                          headers=self.headers,
                          meta={'id': id},
                          dont_filter=True)

    def content_parse(self, response):
        date = response.xpath(
            '//div[@class="fl times"]/text()').extract_first()
        if date == None or len(date) == 0: return
        try:
            if helper.compare_time(date, self.limittime) < 0: return
        except:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta['id'] if 'id' in response.meta.keys(
        ) else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = re.sub(
            '来源:', '',
            response.xpath('//div[@class="fl origin"]/text()').extract_first())
        pipleitem['content'] = helper.list2str(
            response.css('.news-detail-cont').xpath('string(.)').extract())
        pipleitem['editor'] = None
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.news-detail-cont img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = response.css(
            '#support .num-total::text').extract_first()
        pipleitem['dislike'] = response.css(
            '#against .num-total::text').extract_first()
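        # The comment count is served by a separate API, so fetch it synchronously with requests before returning the item.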
        self.commentheaders['Referer'] = response.url
        self.commentpar['articleId'] = response.meta['id']
        html = requests.post(
            url='https://comment.yorkbbs.ca/api/comment/getComment',
            data=self.commentpar,
            headers=self.commentheaders)
        pipleitem['comment'] = json.loads(html.text)['totalCount']
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #54
class AppLeparisienSpider(scrapy.Spider):
    name = 'app.leparisien'
    limittime = settings.get('CRAWL_START_DATE')

    entry_point = {
        'politique': 'http://www.leparisien.fr/politique/',
        'economie': 'http://www.leparisien.fr/economie/',
        'societe': 'http://www.leparisien.fr/societe/'
    }

    headers = {
        'Accept':
        '*/*',
        'Host':
        'www.leparisien.fr',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def start_requests(self):
        for key in self.entry_point.keys():
            yield Request(url=self.entry_point.get(key),
                          method='GET',
                          callback=self.parse,
                          headers=self.headers,
                          dont_filter=True)

    def parse(self, response):
        links = response.css('a::attr(href)').extract()
        for link in set(links):
            if link == None or re.search(
                    'www.leparisien.fr', link) == None or re.search(
                        '\d{2}-\d{2}-\d{4}', link) == None:
                continue
            if re.match('(https|http):', link) == None: link = 'http:' + link
            yield Request(url=link,
                          method='GET',
                          callback=self.content_parse,
                          headers=self.headers)

    def content_parse(self, response):
        date = response.xpath(
            '//meta[@property="article:published_time"]/@content'
        ).extract_first()
        if date == None or len(date) == 0: return
        date = re.findall('\d+[-:]\d+[-:]*\d*', date)

        try:
            if helper.compare_time(helper.list2str(date), self.limittime) < 0:
                return
        except:
            return

        pipleitem = CctvOpinionmonitor5Item()

        id = re.findall('\d{7,}', response.url)
        pipleitem['date'] = helper.list2str(date)
        pipleitem['id'] = id[0] if len(id) > 0 else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//span[@class="margin_top_sm ui_bold"]/text()').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="article-section margin_bottom_article"])'
            ).extract())
        pipleitem['editor'] = response.xpath(
            '//span[@class="margin_top_sm ui_bold"]/text()').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            input=response.css('article.grid img::attr(src)').extract(),
            prefix='http://www.leparisien.fr')
        pipleitem['video_urls'] = helper.list2str(
            response.xpath('//iframe[@allow="autoplay"]/@src').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #55
 def __init__(self):
     connection = MongoClient(settings.get('MONGODB_URI'))
     db = connection[settings['MONGODB_DATABASE']]
     # db.authenticate(settings['MONGODB_USERNAME'], settings['MONGODB_PASSWORD'])
     self.collection = db[settings['CRAWLER_COLLECTION']]
Example #56
 def process_request(self, request, spider):
     request.headers['cookie'] = settings.get('BOSS_COOKIE')
Example #57
class ChinatimesComSpider(scrapy.Spider):
    name = 'chinatimes.com'
    limittime = settings.get('CRAWL_START_DATE')

    # 'https://www.chinatimes.com/politic/total/'
    entry_point = {
        '政治': 'politic',
        '财经': 'money',
        '国际': 'world',
        '两岸': 'chinese',
        '军事': 'armament',
        '社会': 'society',
        '言论': 'opinion',
    }

    headers = {
        'authority': "www.chinatimes.com",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'referer': "https://www.chinatimes.com/opinion/",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,co;q=0.7",
        'Cache-Control': "no-cache",
        'Host': "www.chinatimes.com",
    }

    # 'https://www.chinatimes.com/opinion/PageListTotal/?page=2&_=1561981086807'
    def start_requests(self):
        for key in self.entry_point.keys():
            yield Request(url='https://www.chinatimes.com/{}/PageListTotal/?page=1'.format(self.entry_point[key]), method='GET',
                          callback=self.parse, headers=self.headers,
                          dont_filter=True)

    def parse(self, response):
        jsbd = json.loads(response.text)
        for item in jsbd.get('list', []):
            if not item.get('HyperLink'):
                continue
            date = '{date} {time}'.format(date=item['ArticleDate'], time=item['ArticleTime'])
            date = helper.formatTime(date)
            article_id = item.get('Id')
            yield Request(url='https://www.chinatimes.com{}'.format(item['HyperLink']), method='GET',
                          callback=self.content_parse, headers=self.headers,
                          meta={'date': date, 'id': article_id})

    # 'https://www.chinatimes.com/realtimenews/20190701003505-260407'
    def content_parse(self, response):
        date = response.meta.get('date')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Drop the article if its publish date cannot be parsed or compared.
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('.title::text').extract_first()
        pipleitem['source'] = response.xpath('//div[@name="source"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(response.css('.article-body p').xpath('string(.)').extract())
        pipleitem['editor'] = response.css('.author a::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
示例#58
0
class CaseNumberSpider(scrapy.Spider):
    name = 'casenumber'
    allowed_domains = ['www.itslaw.com']
    custom_settings = {
        "LOG_LEVEL": "DEBUG",
        # "DOWNLOAD_TIMEOUT": 5,
        # "DOWNLOAD_DELAY": 0.2,
        "DOWNLOADER_MIDDLEWARES": {
            # 'itslaw.middlewares.ProxyMiddleware': 543,
            "itslaw.middlewares.ItslawDownloaderMiddleware": 534
        },
        "DEFAULT_REQUEST_HEADERS": {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36",
            "Referer":
            "https://www.itslaw.com/search?searchMode=judgements&sortType=1&conditions=trialYear%2B1994%2B7%2B1994",
            # "Cookie": "_t=0e9084b2-59b6-4cab-985f-be99b553e944; sessionId=49f99a6a-99e0-438a-8181-f3757aa8e267; LXB_REFER=mail.qq.com; _u=f0c76f8f-8df1-4e56-832a-7aa5fe6118c4; Hm_lvt_bc6f194cb44b24b9f44f1c8766c28008=1554555977,1554601580,1554601590,1554601609; Hm_lvt_e496ad63f9a0581b5e13ab0975484c5c=1554555977,1554601580,1554601591,1554601609; _i=bf039e9d-6188-4c4a-8bce-b4bd757b6b67; _p=032dd594-483e-4ec8-ab62-773cf754fdb9; Hm_lpvt_e496ad63f9a0581b5e13ab0975484c5c=1554601618; Hm_lpvt_bc6f194cb44b24b9f44f1c8766c28008=1554601618",
        },
        "ITEM_PIPELINES": {
            'itslaw.pipelines.ConditionPipeline': 300,
        }
    }
    settings = get_project_settings()
    redis_host = settings.get("REDIS_HOST")
    redis_port = settings.get("REDIS_PORT")
    proxy_server = settings.get("PROXY_SERVER")
    proxy_user = settings.get("PROXY_USER")
    proxy_pass = settings.get("PROXY_PASS")
    proxy_auth = "Basic " + base64.urlsafe_b64encode(
        bytes((proxy_user + ":" + proxy_pass), "ascii")).decode("utf8")
    pool = ConnectionPool(host=redis_host, port=redis_port, db=0)
    r = Redis(connection_pool=pool)
    count = os.getenv("COUNT", "")
    key = f'conditions:case{count}'

    # key = f'conditions:error'

    def start_requests(self):
        # $env:COUNT=""
        self.name += self.count
        while True:
            # Drop URLs that have already been handled (crawled, split into pages,
            # beyond the 400-result cap, or known to return no results), then log
            # how many condition combinations remain in the queue.
            self.r.sdiffstore(self.key, self.key, "conditions:crawled")
            self.r.sdiffstore(self.key, self.key, "conditions:pages")
            self.r.sdiffstore(self.key, self.key, "conditions:beyond")
            left = self.r.sdiffstore(self.key, self.key, "conditions:noresult")
            self.logger.info(f"{left} condition combinations left to crawl.")
            urls = self.r.srandmember(self.key, number=10000)
            if not urls:
                break
            for url in urls:
                yield Request(str(url, encoding="utf-8"), dont_filter=True)

    def parse(self, response):
        url = response.url
        res = json.loads(response.body_as_unicode())
        code = res["result"]["code"]
        message = res["result"]["message"]
        self.logger.debug(message)

        if 0 != code:
            error_essage = res["result"]["errorMessage"]
            self.logger.debug(error_essage)
            return

        try:
            data = res["data"]
        except Exception as e:
            self.logger.debug(e)
            yield Request(url=response.url, dont_filter=True)
            return

        searchResult = data["searchResult"]
        total_count = searchResult["totalCount"]
        if 0 == total_count:
            self.r.sadd("conditions:noresult", url)
        elif total_count <= 20:
            judgements = searchResult["judgements"]
            for each in judgements:
                yield JudgementItem(id=each["id"])
            # Everything fit on a single page, so mark this condition URL as crawled.
            self.r.sadd("conditions:crawled", url)
        elif total_count <= 400:
            self.r.sadd("conditions:pages", url)
        else:
            self.r.sadd("conditions:beyond", url)
示例#59
0
    def __init__(self, *args, **kwargs):
        kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None
        kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8')

        super(CSVkwItemExporter, self).__init__(*args, **kwargs)
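
# A custom exporter such as CSVkwItemExporter is only used once Scrapy knows about
# it. The lines below are a hypothetical settings.py excerpt: FEED_EXPORTERS is the
# standard Scrapy setting for registering exporters, EXPORT_FIELDS and EXPORT_ENCODING
# are the custom settings read by the exporter above, and the module path
# 'myproject.exporters.CSVkwItemExporter' is a placeholder.
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CSVkwItemExporter',
}
EXPORT_FIELDS = ['id', 'title', 'url']  # hypothetical field list
EXPORT_ENCODING = 'utf-8-sig'           # e.g. BOM-prefixed UTF-8 for Excel-friendly CSVs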
示例#60
0
class ConditionSpider(scrapy.Spider):
    name = 'condition'
    allowed_domains = ['www.itslaw.com']
    custom_settings = {
        # "LOG_LEVEL": "DEBUG",
        "DOWNLOAD_TIMEOUT": 5,
        # "DOWNLOAD_DELAY": 0.2,
        "DOWNLOADER_MIDDLEWARES": {
            'itslaw.middlewares.ProxyMiddleware': 543,
            # "itslaw.middlewares.ItslawDownloaderMiddleware": 534
        },
        "DEFAULT_REQUEST_HEADERS": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36", 
            "Referer": "https://www.itslaw.com/search?searchMode=judgements&sortType=1&conditions=trialYear%2B1994%2B7%2B1994",
            "Cookie": "_t=0e9084b2-59b6-4cab-985f-be99b553e944; showSubSiteTip=false; subSiteCode=bj; LXB_REFER=www.wusong.com; Hm_lvt_bc6f194cb44b24b9f44f1c8766c28008=1555339418,1555339440,1555339451; Hm_lvt_e496ad63f9a0581b5e13ab0975484c5c=1555339418,1555339440,1555339451; sessionId=53b834b2-5dc8-4be5-889f-c5c425f51fc6; _u=8768e601-6c73-4ff3-941a-99f77f09b573; Hm_lpvt_bc6f194cb44b24b9f44f1c8766c28008=1557581284; Hm_lpvt_e496ad63f9a0581b5e13ab0975484c5c=1557581284",
        },
        "ITEM_PIPELINES": {
            'itslaw.pipelines.ConditionPipeline': 300,
        }
    }
    settings = get_project_settings()
    redis_host = settings.get("REDIS_HOST")
    redis_port = settings.get("REDIS_PORT")
    proxy_server = settings.get("PROXY_SERVER")
    proxy_user = settings.get("PROXY_USER")
    proxy_pass = settings.get("PROXY_PASS")
    proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((proxy_user + ":" + proxy_pass), "ascii")).decode("utf8")
    pool = ConnectionPool(host=redis_host, port=redis_port, db=0)
    r = Redis(connection_pool=pool)
    key = f'condition:searchword{os.getenv("COUNT", "")}'
    # key = f'conditions:error'
    
    def start_requests(self):
        # $env:COUNT=""
        while True:
            left = self.r.sdiffstore(self.key, self.key, "condition:crawled")
            self.logger.info(f"[*] {left} condition combinations left to crawl.")
            urls = self.r.srandmember(self.key, number=10000)
            if not urls:
                break
            for url in urls:
                yield Request(str(url, encoding="utf-8"), dont_filter=True)

    def parse(self, response):
        url = response.url
        try:
            res = json.loads(response.body_as_unicode())
        except Exception:
            # Not a valid JSON response (e.g. a block or error page); skip it.
            return
        code = res["result"]["code"]
        message = res["result"]["message"]
        self.logger.debug(message)
        
        if 0 != code:
            error_essage = res["result"]["errorMessage"]
            self.logger.debug(error_essage)
            self.r.sadd("condition:error", response.url)   
            return

        try:
            data = res["data"]
        except Exception as e:
            self.r.sadd("condition:error", response.url)
            self.logger.debug(e)
            yield Request(url=response.url, dont_filter=True)
            return

        searchResult = data["searchResult"]
        judgements = searchResult.get("judgements", [])

        for each in judgements:
            yield JudgementItem(id=each["id"])
        # Mark this condition URL as crawled once all of its judgements have been yielded.
        self.r.sadd("condition:crawled", url)