def test_tags(self):
    html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)

    lx = SgmlLinkExtractor(tags=None)
    self.assertEqual(lx.extract_links(response), [])

    lx = SgmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(tags="area")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])

    lx = SgmlLinkExtractor(tags="a")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
    ])
def test_encoded_url_in_restricted_xpath(self):
    body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    lx = SgmlLinkExtractor(restrict_xpaths="//div")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
    ])
def parse_session_hash(self, response):
    extractor = SgmlLinkExtractor(allow=r'/w/valikko\.jsp', tags='frame', attrs=('src',))
    link = extractor.extract_links(response)[0]
    query = urlparse.urlparse(link.url).query
    params = urlparse.parse_qs(query)
    return params['MD5avain'][0]
def test_deny_extensions(self):
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, check_url=True):
    # Add check_url parameter.
    # Note: attrs must be a tuple; ('href') without a trailing comma is just the string 'href'.
    self.check_url = check_url
    SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains,
                               deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                               tags=tags, attrs=attrs, canonicalize=canonicalize,
                               unique=unique, process_value=process_value)
def parse_hospital_active_doctor(self, response):
    """This function parses a sample response. Some contracts are mingled with this docstring.

    @url http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLU0Jq1rbc1P6dS2aO/daifu.htm
    @returns items 14 14
    @returns requests 20 100
    @scrapes _name hospital specialty title reply2wCount
    """
    hxs = HtmlXPathSelector(response)
    city = response.meta['city']
    area = response.meta['area']
    print "$$$ current city: %s area: %s" % (city[0], area[0])

    # Sample:
    # http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLUE-578VWVmvC3uh7/daifu.htm
    linkExtractor = SgmlLinkExtractor(allow=(r"/hospital/\S+/\S+/daifu.htm",), unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        request = Request(link.url, callback=self.parse_hospital_active_doctor)
        request.meta['city'] = response.meta['city']
        request.meta["area"] = response.meta['area']
        yield request

    hospital = hxs.select("/html/body/div[3]/div/a[3]/text()").extract()[0]
    print hospital
    specialty = hxs.select("//div[@class='subnav']/a/text()").re(r'(\S+)\s+(\S+)')[0]
    print specialty

    docLinks = hxs.select("//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
    #docLinks = hxs.select("//table[@id='doc_list_index']/tr")
    for doc in docLinks:
        l = XPathItemLoader(ActiveDoctorItem(), doc)
        docNames = doc.select("./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()").extract()
        if len(docNames) != 0:
            print docNames[0]
            l.add_xpath('_name', "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_value('city', response.meta['city'])
            l.add_value('area', response.meta['area'])
            title = doc.select("./td[@class='tda']/li/text()").re('\S+')
            if len(title) == 1:
                l.add_value('title', title[0])
            l.add_xpath('count_ReplyInTwoWeeks', u"./td[@class='td_hf']/div[contains(text(), '近2周回复咨询')]/span/text()")
            l.add_xpath('count_ReplyTotal', u"./td[@class='td_hf']/div[contains(text(), '总共回复')]/span/text()")
            l.add_xpath('count_Calls', u"./td[@class='td_hf']/div[contains(text(), '已接听电话咨询')]/span/text()")
            ret = l.load_item()
            #print ret
            yield ret
def parseL2(self, response):
    # forums - links to lists and to threads
    s2 = SgmlLinkExtractor(restrict_xpaths=['//table[@class="forums-list"]/tr/td/a'])
    Links = s2.extract_links(response)
    for l in Links:
        yield Request(l.url, callback=self.parseL3)
    self.scrapeTheadURL(response)
def parse(self, response):
    # title page
    hxs = HtmlXPathSelector(response)
    s1 = SgmlLinkExtractor(restrict_xpaths=['//a[@class="title"]'])
    Links = s1.extract_links(response)
    for l in Links:
        yield Request(l.url, callback=self.parseL2)
def parseThread(self, response):
    print('inside a thread')
    hxs = HtmlXPathSelector(response)
    filename = "xxx" + response.url.split("/")[-2][1:]
    with open(filename, 'a') as f:
        for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
            msgID = entry.select('span/@id').extract()[0]
            msgDate = entry.select('h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            msgText = ' '.join(entry.select('span/text()').extract()).encode('ascii', 'ignore').replace('\n', '')
            try:
                mgAuthor = entry.select('h3/span/a/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except:
                mgAuthor = 'none'
            try:
                msgTitle = entry.select('h3/strong/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except:
                msgTitle = "none"
            f.write('msgID:' + msgID + '\n')
            f.write('msgTitle:' + msgTitle + '\n')
            f.write('mgAuthor:' + mgAuthor + '\n')
            f.write('msgDate:' + msgDate + '\n')
            f.write('msgText:' + msgText + '\n\n')
    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    Links = s.extract_links(response)
    if len(Links) > 0:
        print 'going to the next page'
        yield Request(Links[0].url, callback=self.parseThread)
def test_attrs(self):
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    lx = self.extractor_cls(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])

    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_restrict_xpaths_concat_in_handle_data(self):
    """html entities cause SGMLParser to call handle_data hook twice"""
    body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
    response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
    lx = SgmlLinkExtractor(restrict_xpaths="//div")
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                           fragment='', nofollow=False)])
def test_base_url_with_restrict_xpaths(self):
    html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
    <body><p><a href="item/12.html">Item 12</a></p>
    </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    lx = SgmlLinkExtractor(restrict_xpaths="//p")
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None, seen_urls=[]):
    SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains,
                               deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                               tags=tags, attrs=attrs, canonicalize=canonicalize,
                               unique=unique, process_value=process_value,
                               deny_extensions=deny_extensions)
    # mark URLs that were already seen so they are skipped later
    for l in seen_urls:
        self.seen_urls[l] = True
def test_restrict_xpaths(self):
    lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]',))
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [
            Link(url="http://example.com/sample1.html", text=u""),
            Link(url="http://example.com/sample2.html", text=u"sample 2"),
        ],
    )
def parseL3(self, response):
    # like model specific
    self.scrapeTheadURL(response)
    # multipage
    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    Links = s.extract_links(response)
    if len(Links) > 0:
        yield Request(Links[0].url, callback=self.parseL3)
def extract_links(self, response, **extra):  # {{{
    """
    Extract links from response

    extra - passed to SgmlLinkExtractor
    """
    link_extractor = SgmlLinkExtractor(**extra)
    links = link_extractor.extract_links(response)
    return links
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, ignore_set=set()):
    self.ignore_set = ignore_set
    SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains,
                               deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                               tags=tags, attrs=attrs, canonicalize=canonicalize,
                               unique=unique, process_value=process_value)
def test_link_nofollow(self):
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
    ])
def parse(self, response):
    print "IN PARSE!"
    # inspect_response(response, self)
    links = SgmlLinkExtractor(
        allow=('https://www.coursera.org/course/\w+'),
    )
    print "SIZE:", len(links.extract_links(response))
    for link in links.extract_links(response):
        # print link
        yield Request(link.url, callback=self.parse_item)
def parse(self, response):
    # renamed to parse() so the whole home page gets crawled
    lx = SgmlLinkExtractor()
    urls = lx.extract_links(response)
    noworder = 0
    for oneurl in urls:
        noworder += 1
        yield scrapy.Request(
            oneurl.url,
            callback=lambda response, crawllevel=1, order=noworder, loopstr="": self.parse_text(
                response, crawllevel, order, loopstr
            ),
        )
def parse_testfile(self, response):
    lx = SgmlLinkExtractor()
    urls = lx.extract_links(response)
    readed = 0
    notreaded = 0
    for oneurl in urls:
        handle = OpenMD5File(oneurl.url, "rb")
        if handle == False:
            notreaded += 1
        else:
            readed += 1
            handle.close()
    print readed, notreaded
def parse_start_url(self, response):
    if not hasattr(response, 'encoding'):
        setattr(response, 'encoding', 'text/html;charset=UTF-8')
    target_le = SgmlLinkExtractor(
        allow=r'/cn/products/products_detail.asp\?Catalog_id=\w+')
    links = target_le.extract_links(response)
    if links:
        return [Request(url=link.url, cookies=self.forged_cookie, callback=self.parse_item)
                for link in links]
    else:
        general_le = SgmlLinkExtractor(allow=())
        return [Request(url=link.url, cookies=self.forged_cookie)
                for link in general_le.extract_links(response)]
def test_extraction_using_single_values(self):
    '''Test the extractor's behaviour among different situations'''
    lx = SgmlLinkExtractor(allow='sample')
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ])

    lx = SgmlLinkExtractor(allow='sample', deny='3')
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(allow_domains='google.com')
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://www.google.com/something', text=u''),
    ])

    lx = SgmlLinkExtractor(deny_domains='example.com')
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://www.google.com/something', text=u''),
    ])
def test_extraction_using_single_values(self):
    """Test the extractor's behaviour among different situations"""
    lx = SgmlLinkExtractor(allow="sample")
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [
            Link(url="http://example.com/sample1.html", text=u""),
            Link(url="http://example.com/sample2.html", text=u"sample 2"),
            Link(url="http://example.com/sample3.html", text=u"sample 3 text"),
        ],
    )

    lx = SgmlLinkExtractor(allow="sample", deny="3")
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [
            Link(url="http://example.com/sample1.html", text=u""),
            Link(url="http://example.com/sample2.html", text=u"sample 2"),
        ],
    )

    lx = SgmlLinkExtractor(allow_domains="google.com")
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [Link(url="http://www.google.com/something", text=u"")],
    )

    lx = SgmlLinkExtractor(deny_domains="example.com")
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [Link(url="http://www.google.com/something", text=u"")],
    )
def parse(self, response):
    print('inside a thread')
    hxs = HtmlXPathSelector(response)
    filename_ = response.url.split("/")[-2][1:]
    filename = os.path.abspath(databasePath + "\data\%s" % filename_)
    dumpFilePath = os.path.abspath(databasePath + "\dump\%s" % filename_)
    try:
        a = response.meta['page']
    except KeyError:
        a = 0
        os.mkdir(dumpFilePath)
    with open(filename, 'a') as f:
        # header
        forumTitle = hxs.select('//div[@class="module forums"]/h2/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
        extraInfo = hxs.select('//div[@class="module forums discussion tid"]/h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
        f.write("title:" + forumTitle + "\n")
        f.write("extraInfo:" + extraInfo + "\n")
        f.write(response.url + "\n")
        f.write(filename + "\n")
        f.write(dumpFilePath + "\n\n")
    with open(dumpFilePath + "\\" + str(a) + '.html', 'a') as fd:
        fd.write(response.body)
    with open(filename, 'a') as f:
        for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
            msgID = entry.select('span/@id').extract()[0]
            msgDate = entry.select('h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            msgText = ' '.join(entry.select('span/text()').extract()).encode('ascii', 'ignore').replace('\n', '')
            try:
                mgAuthor = entry.select('h3/span/a/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except:
                mgAuthor = 'none'
            try:
                msgTitle = entry.select('h3/strong/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except:
                msgTitle = "none"
            f.write('msgID:' + msgID + '\n')
            f.write('msgTitle:' + msgTitle + '\n')
            f.write('mgAuthor:' + mgAuthor + '\n')
            f.write('msgDate:' + msgDate + '\n')
            f.write('msgText:' + msgText + '\n\n')
    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    Links = s.extract_links(response)
    if len(Links) > 0:
        print 'going to the next page'
        r = Request(googc + Links[0].url, callback=self.parse)
        r.meta['page'] = a + 1
        yield r
def test_restrict_xpaths_encoding(self):
    """Test restrict_xpaths with encodings"""
    html = """<html><head><title>Page title<title>
    <body><p><a href="item/12.html">Item 12</a></p>
    <div class='links'>
    <p><a href="/about.html">About us\xa3</a></p>
    </div>
    <div>
    <p><a href="/nofollow.html">This shouldn't be followed</a></p>
    </div>
    </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")
    lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
    self.assertEqual(lx.extract_links(response),
                     [Link(url="http://example.org/about.html", text=u"About us\xa3")])
def test_process_value(self):
    """Test the process_value callback"""
    html = """
    <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
    <a href="/about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding="windows-1252")

    def process_value(value):
        m = re.search("javascript:goToPage\('(.*?)'", value)
        if m:
            return m.group(1)

    lx = SgmlLinkExtractor(process_value=process_value)
    self.assertEqual(lx.extract_links(response),
                     [Link(url="http://example.org/other/page.html", text="Link text")])
def parse_brands(self, response):
    lx = SgmlLinkExtractor(restrict_xpaths=('//td[@valign="top"]'), allow=('\S+\.com'), unique=True)
    links = lx.extract_links(response)
    brands_all = set(sorted(link.text for link in links))
    self.log(u'Extracted {} brands.'.format(len(brands_all)), scrapy.log.DEBUG)

    """Traverse through all the pages to get all products"""
    """brands_alphabets = ['A','B','C','D','E','F','G','H','I',
                           'J','K','L','M','N','O','P','Q','R',
                           'S','T','U','V','W','X','Y','Z']"""
    brands_alphabets = ['A']
    for alpha in brands_alphabets:
        yield Request(self.url_view_items + str(alpha), callback=self.items_list)
def crawl_all(self, response):
    print 'Crawling all...'
    # Get list of decks
    self.deck_links = SgmlLinkExtractor(allow=r'/sets/\d+').extract_links(response)
    return self.parse_deck_links(None)
def parsePage(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    emails = collectAllEmail(hxs.extract())
    if len(emails) > 0:
        item['email'] = emails[0]
        yield item
    extractor = SgmlLinkExtractor(allow_domains=response.url)
    for entry in extractor.extract_links(response):
        if entry.url is not None:
            req = Request(entry.url, callback=self.parsePage)
            req.meta['item'] = item
            yield req
class HideMyAssSpider(CrawlSpider):
    name = 'hidemyass'
    start_urls = ['http://hidemyass.com/proxy-list/']
    allowed_domains = ['hidemyass.com']
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            '//div[@id="container"]//div[@id="pagination"]/ul/div/li[@class="nextpageactive"]/a'
        )), callback='parse', follow=True),
    )

    def parse(self, response):
        self.log('No item received for %s' % response.url)
        for elem in super(HideMyAssSpider, self).parse(response):
            yield elem
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//tr[@class="altshade"]')
        for link in links:
            ipaddress_parts = link.select('td[2]/span')
            style_text = ipaddress_parts.select('style/text()').extract()
            style_text = style_text[0].split('\n')
            display_none = [
                style[1:style.index('{')] for style in style_text if 'none' in style
            ]
            display_inline = [
                style[1:style.index('{')] for style in style_text if 'inline' in style
            ]
            display_none = set(display_none)
            display_inline = set(display_inline)
            ipaddress = []
            for ipaddress_part in ipaddress_parts.select('span|div|text()'):
                tag_class = tag_style = tag_name = None
                try:
                    tag_class = ipaddress_part.select('@class').extract()
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type
                    # (expected lxml.etree._Element, got _ElementStringResult)
                    pass
                try:
                    tag_style = ipaddress_part.select('@style').extract()
                except TypeError:
                    # Workaround bug in lxml.etree (same as above)
                    pass
                try:
                    tag_name = ipaddress_part.select("name()")
                except TypeError:
                    # Workaround bug in lxml.etree (same as above)
                    pass
                if tag_name:
                    tag_text = ipaddress_part.select('text()').extract()
                else:
                    tag_text = ipaddress_part.extract()
                if tag_style and 'none' in tag_style[0]:
                    continue
                if tag_class and tag_class[0] in display_none:
                    continue
                if isinstance(tag_text, list):
                    tag_text = ''.join(tag_text)
                tag_texts = tag_text.split('.')
                for tag_text in tag_texts:
                    tag_text = tag_text.strip()
                    if not tag_text.isdigit():
                        continue
                    ipaddress.append(tag_text)
            ipaddress = '.'.join(ipaddress)
            loader = WebsiteLoader(selector=link)
            loader.add_value('ipaddress', ipaddress)
            loader.add_xpath('port', 'td[3]/text()')
            loader.add_xpath('country', 'td[4]/span/text()')
            loader.add_xpath('_type', 'td[7]/text()')
            loader.add_xpath('anonimity', 'td[8]/text()')
            loader.add_value('url', response.url)
            item = loader.load_item()
            yield item
class KompasCrawler(NewsBaseCrawler):
    # Identifier
    name = 'kompas'
    source = 'kompas.com'

    # Debug
    debug = False

    # Rules
    allowed_domains = ['kompas.com']
    start_urls = [
        'http://www1.kompas.com/newsindex/secidx/1/nasional/',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('/read/', ), unique=True),
             follow=True, callback='parse_item'),
    )

    # XPath
    xpath_title = '//div[@class="judul_artikel2011"]'
    xpath_subtitle = '//div[@class="font11 c_orange_kompas2011 pb_5 pt_5"]'
    xpath_category = '//div[@class="menu_kompas"]/ul/li/a[@class="selected"]'
    xpath_author = '//none'
    xpath_published_at = '(//div[@class="font11 c_abu03_kompas2011 pb_3"]/span[@class="c_abu01_kompas2011"])[last()]'
    xpath_place = '//div[@class="isi_berita2011 pt_5"]/p/strong'
    xpath_content = '//p'

    # Overridden methods
    def parse_date(self, date_str):
        split_str = date_str.split(' ')
        year = split_str[3]
        if split_str[2] == 'Januari':
            month = '01'
        elif split_str[2] == 'Pebruari' or split_str[2] == 'Februari':
            month = '02'
        elif split_str[2] == 'Maret':
            month = '03'
        elif split_str[2] == 'April':
            month = '04'
        elif split_str[2] == 'Mei':
            month = '05'
        elif split_str[2] == 'Juni':
            month = '06'
        elif split_str[2] == 'Juli':
            month = '07'
        elif split_str[2] == 'Agustus':
            month = '08'
        elif split_str[2] == 'September':
            month = '09'
        elif split_str[2] == 'Oktober':
            month = '10'
        elif split_str[2] == 'November' or split_str[2] == 'Nopember':
            month = '11'
        elif split_str[2] == 'Desember':
            month = '12'
        else:
            month = '01'
        # zero-pad single-digit day numbers
        if int(split_str[1]) < 10:
            day = "0" + split_str[1]
        else:
            day = split_str[1]
        time = split_str[5]
        return "%s-%s-%s %s:00" % (year, month, day, time)

    def parse_place(self, place_str):
        split_str = place_str.split(',')
        return split_str[0]

    def normalize_category(self, category_str):
        if category_str.lower() in ('nasional', 'regional', 'megapolitan'):
            return self.CATEGORY_NATIONAL
        elif category_str.lower() in ('internasional',):
            return self.CATEGORY_INTERNATIONAL
        elif category_str.lower() in ('bisniskeuangan',):
            return self.CATEGORY_ECONOMY
        elif category_str.lower() in ('olahraga',):
            return self.CATEGORY_SPORTS
        elif category_str.lower() in ('sains',):
            return self.CATEGORY_SCITECH
        elif category_str.lower() in ('travel', 'oase', 'edukasi'):
            return self.CATEGORY_HUMANIORA
        else:
            return self.CATEGORY_OTHERS
class ExampleSpider(CrawlSpider):
    name = 'example.com'
    start_urls = ['https://app1.com/users/home.php']
    item_urls = []
    # urlpool = []

    # 'log' and 'pwd' are the names of the username and password fields;
    # these depend on each website, so you'll have to change them accordingly.
    # One may use the loginform lib https://github.com/scrapy/loginform to make it easier
    # when handling multiple credentials from multiple sites.
    rules = (
        Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_page', follow=True),
    )

    def init_request(self):
        return [Request(url='https://app1.com/users/home.php', callback=self.login)]

    def login(self, response):
        print "hell"
        return FormRequest.from_response(response,
                                         formdata={'username': '******', 'password': '******'},
                                         callback=self.after_login)

    def after_login(self, response):
        # check that login succeeded before going on
        if "ERROR: Invalid username" in response.body:
            self.log("Login failed", level=log.ERROR)
            return
        # continue scraping with authenticated session...
        else:
            self.log("Login succeed!", level=log.DEBUG)
            print "Logging in"
            print "asdsdsd " + response.url
            return Request(url='https://app1.com/users/home.php', callback=self.parse_page)
            self.initialized()

    # example of crawling all other urls in the site with the same
    # authenticated session.
    def isURLinPool(self, url):
        for t in self.item_urls:
            if t.find("?") != -1:
                t = t[:t.find("?")]
            if url.find("?") != -1:
                url = url[:url.find("?")]
            if url.lower() == t.lower():
                return False
        return True

    def parse_page(self, response):
        """ Scrape useful stuff from page, and spawn new requests """
        hxs = HtmlXPathSelector(response)
        # i = CrawlerItem()
        # find all the links in the <a href> tags
        input_box = hxs.select('//input/@name').extract()
        print "Scraping the URL " + response.url
        for inputs in input_box:
            print "The input boxes with name " + inputs
        links = hxs.select('//a/@href').extract()
        input_box = hxs.select('//input/@src').extract()
        # print "Scraping the URL " + response.url
        for inputs in input_box:
            print "The input boxes with src " + inputs
        print "\n"
        # Yield a new request for each link we found
        # this may lead to infinite crawling...
        ur1 = ""
        for link in links:
            url = "https://app1.com" + link
            if link.find(":") != -1:
                continue
            if self.isURLinPool(url):
                print "THIS IS A LINK " + link
                # yield Request(url="https://app1.com"+link, callback=self.parse_page)
                # only process external/full links
                link = url
                ur1 = link
                if link.find("http") > -1:
                    print "Before Sending it for parse " + link
                    yield Request(url=link, callback=self.parse_page)
        self.item_urls.append(response.url)
        item = LinkItem()
        item["title"] = hxs.select('//title/text()').extract()[0]
        item["url"] = response.url
        # self.item_urls["title"] = hxs.select('//title/text()').extract()[0]
        # self.item_urls["url"] = response.url
        yield self.collect_item(item)

    def collect_item(self, item):
        return item
class ApprenticeSpider(CrawlSpider):
    name = 'apprentice'
    start_urls = START_URLS
    rules = (
        Rule(SgmlLinkExtractor(unique=True), callback='parse_item', follow=False),  # should it be unique?
    )
    crawled_urls = []

    print 'Loading critic...'
    print "path", os.path.join(DIRNAME, '../../../../data/classifier.pickle')
    supervisor = read_classifier(
        os.path.join(DIRNAME, '../../../../data/classifier.pickle'))
    print 'Critic loaded...'
    apprentice = NBClassifier()

    def parse_item(self, response):
        """
        Crawl the webpage and extract the urls.
        Once the crawling is done, evaluate the page content
        and enter this function again to train the apprentice.
        """
        ApprenticeSpider.crawled_urls.append(response.url)

        # if response.meta.has_key('train_flag') and response.meta['train_flag']:  # entering the train mode
        # print "training the apprentice"

        # html to words
        words = html2words(response.body)
        probs = ApprenticeSpider.supervisor.predict(words)
        interestness = probs['pos']

        # use the score to train the apprentice using the surrounding (word, offset) pairs
        # print "word_offset_pairs = ", response.meta['word_offset_pairs']
        # print "interestness of %s = %f" % (response.url, interestness)
        if response.meta.has_key('word_offset_pairs'):
            # ApprenticeSpider.apprentice.train([(response.meta['word_offset_pairs'], interestness > 0.5 and "pos" or "neg")])
            item = UrlItem()
            item['url'] = response.url
            item['interestness'] = interestness
            yield item
        # else:
        # print "fetching the urls"
        url_infos = scrape_url_and_words(response.body, response.url, level=3)
        for url_info in url_infos:
            url, word_offset_pairs = url_info
            if url in ApprenticeSpider.crawled_urls:
                # already crawled, skip it
                continue
            prediction = ApprenticeSpider.apprentice.predict(word_offset_pairs)
            if prediction.has_key('pos'):
                # get the potential interest of the url
                potential_interestness = prediction['pos']
            else:
                potential_interestness = 0  # neg is 1
            # print "pi of %s is %f" % (potential_interestness, potential_interestness)

            # converting the priority to int to accord with the PriorityQueue spec
            priority = int(potential_interestness * 10 ** 3)
            # after the request is done, run parse_item to train the apprentice
            req = Request(
                url,
                priority=priority,
                callback=self.parse_item
            )  # this line is extremely important, or items harvest rate drops drastically
            # passing additional data to the request
            req.meta['word_offset_pairs'] = word_offset_pairs
            # req.meta['train_flag'] = True  # we only do training, nothing else
            yield req
def test_matches(self):
    url1 = 'http://lotsofstuff.com/stuff1/index'
    url2 = 'http://evenmorestuff.com/uglystuff/index'

    lx = SgmlLinkExtractor(allow=(r'stuff1',))
    self.assertEqual(lx.matches(url1), True)
    self.assertEqual(lx.matches(url2), False)

    lx = SgmlLinkExtractor(deny=(r'uglystuff',))
    self.assertEqual(lx.matches(url1), True)
    self.assertEqual(lx.matches(url2), False)

    lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com',))
    self.assertEqual(lx.matches(url1), False)
    self.assertEqual(lx.matches(url2), True)

    lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com',))
    self.assertEqual(lx.matches(url1), False)
    self.assertEqual(lx.matches(url2), True)

    lx = SgmlLinkExtractor(allow=('blah1',), deny=('blah2',),
                           allow_domains=('blah1.com',), deny_domains=('blah2.com',))
    self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
    self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
    self.assertEqual(lx.matches('http://blah2.com/blah1'), False)
    self.assertEqual(lx.matches('http://blah2.com/blah2'), False)
class ItunesSpider(CrawlSpider):
    name = 'itunes'
    allowed_domains = ['apple.com']
    start_urls = ['https://itunes.apple.com/us/genre/ios-games/id6014?mt=8']
    rules = (
        #Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'us/app/.+'), callback='parse_item', follow=True),
    )

    xmlstring = '<?xml version="1.0" encoding="UTF-8"?><root>'
    xmlTemplate = """
    <item>
        <image>%(image)s</image>
        <stars>%(stars)s</stars>
        <title>%(title)s</title>
        <category>%(category)s</category>
        <desc>%(desc)s</desc>
        <link>%(link)s</link>
    </item>
    """

    def parse_item(self, response):
        image = stars = title = category = desc = link = ''
        hxs = HtmlXPathSelector(response)
        image = hxs.select("//div[@id='left-stack']/div/a/div/img/@src").extract()[0]
        title = hxs.select("//div[@id='title']/div/h1/text()").extract()[0]
        category = hxs.select("//li[@class='genre']/a/text()").extract()[0]
        desc = hxs.select("//div[@class='product-review']/p").extract()[0]
        data = {
            'image': image,
            'stars': stars,
            'title': title,
            'category': category,
            'desc': desc,
            'link': link
        }
        self.xmlstring += self.xmlTemplate % data

    # http://www.suchkultur.de/blog/suchmaschinen/crawler/web-scraping-mit-dem-scrapy-framework/
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select("//div[@id='selectedcontent']/div/ul/li")
        i = 0
        #image = stars = title = category = desc = link = ''
        for site in sites:
            i += 1
            #image = site.select("div/div/div/a[contains(@class,'thumbnail')]/img/@src").extract()[0]
            #stars = ''  # site.select("div/div/div/div[contains(@class,'ratings')]/@title").extract()
            #title = site.select("div/div/div/a[contains(@class,'title')]/text()").extract()[0]
            #category = site.select("div/div/span[contains(@class,'attribution')]/div/a/text()").extract()[0]
            #desc = site.select("div/div/p[contains(@class,'snippet-content')]/text()").extract()[0]
            link = site.select("a/@href").extract()[0]
            if i == 1:
                yield Request(link, callback=self.parse_item)
            # print image, stars, title, category, desc, link, '\n'
            #print link
            # print "\n"
            #data = {'image':image,'stars':stars,'title':title,'category':category,'desc':desc,'link':link}
            #self.xmlstring += self.xmlTemplate % data
        self.xmlstring += "<size>" + str(i) + "</size></root>"
        filename = 'itunes.xml'
        f = open(filename, 'w')
        f.write(self.xmlstring)
def test_extraction(self):
    '''Test the extractor's behaviour among different situations'''
    lx = SgmlLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
    ])

    lx = SgmlLinkExtractor(allow=('sample', ))
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ])

    lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
    ])

    lx = SgmlLinkExtractor(allow=('sample', ))
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
    ])

    lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(allow_domains=('google.com', ))
    self.assertEqual([link for link in lx.extract_links(self.response)], [
        Link(url='http://www.google.com/something', text=u''),
    ])
class AvitoSpider(CrawlSpider):
    ORIGIN_ID = 1
    MAX_PAGE = 20
    _last_page = {}
    name = 'avito'
    allowed_domains = ['avito.ru']
    custom_settings = {'DOWNLOAD_DELAY': 3}
    localities = {
        'gelendzhik': LocalityMapper.GELENDZHIK,
        'anapa': LocalityMapper.ANAPA,
        'novorossiysk': LocalityMapper.NOVOROSSIYSK,
        'temryuk': LocalityMapper.TEMRYUK,
        'abrau-dyurso': LocalityMapper.ABRAUDYURSO,
        'anapskaya': LocalityMapper.ANAPSKAYA,
        'arhipo-osipovka': LocalityMapper.ARHIPOOSIPOVKA,
        'ahtanizovskaya': LocalityMapper.AHTANIZOVSKAYA,
        'verhnebakanskiy': LocalityMapper.VERHNEBAKANSKIY,
        'vinogradnyy': LocalityMapper.VINOGRADNYY,
        'vityazevo': LocalityMapper.VITYAZEVO,
        'vyshesteblievskaya': LocalityMapper.VYSHESTEBLIEVSKAYA,
        'gayduk': LocalityMapper.GAYDUK,
        'glebovka': LocalityMapper.GLEBOVSKOE,
        'golubitskaya': LocalityMapper.GOLUBITSKAYA,
        'gostagaevskaya': LocalityMapper.GOSTAGAEVSKAYA,
        'kurchanskaya': LocalityMapper.KURCHANSKAYA,
        'kabardinka': LocalityMapper.KABARDINKA,
        'divnomorskoe': LocalityMapper.DIVNOMORSKOE,
        'dzhiginka': LocalityMapper.DZHIGINKA,
        'myshako': LocalityMapper.MYSHAKO,
        'natuhaevskaya': LocalityMapper.NATUHAEVSKAYA,
        'raevskaya': LocalityMapper.RAEVSKAYA,
        'yurovka': LocalityMapper.YUROVKA,
        'tsibanobalka': LocalityMapper.TSYBANOBALKA,
        'taman': LocalityMapper.TAMAN,
        'supseh': LocalityMapper.SUPSEH,
        'krasnodarskiy_kray_strelka': LocalityMapper.STRELKA,
        'starotitarovskaya': LocalityMapper.STAROTITAROVSKAYA,
        'sennoy': LocalityMapper.SENNOY,
    }
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagination__nav clearfix"]/a',)),
             follow=True, process_request='process_request_filter',
             callback='process_response_filter'),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@class="description-title-link"]',),
                               process_value=process_value),
             callback='parse_item'),
    )

    def start_requests(self):
        template = "https://www.avito.ru/%s/%s/prodam?user=1&view=list"
        com_template = "https://www.avito.ru/%s/kommercheskaya_nedvizhimost/prodam/%s/za_vse?user=1&view=list"
        urls = []
        types = ['kvartiry', 'komnaty', 'doma_dachi_kottedzhi',
                 'zemelnye_uchastki', 'garazhi_i_mashinomesta', ]
        com_types = ['magazin', 'gostinicy', 'drugoe', 'proizvodstvo', 'sklad', 'ofis']
        for l in self.localities.iterkeys():
            for t in types:
                urls.append(template % (l, t))
            for com_type in com_types:
                urls.append(com_template % (l, com_type))
        for url in urls:
            yield Request(url, self.parse)

    def parse_item(self, response):
        item = AvitoItem()
        fields_parser = AvitoFleldsParser(Selector(response), url=response.url,
                                          data={'localities': self.localities})
        fields_parser.populate_item(item)
        item.print_item()
        return item

    def process_response_filter(self, response):
        print response.url
        dates = Selector(response).xpath('//span[@class="date"]/text()')
        for date in dates:
            txt = date.extract()
            key = ur'вчера|сегодня'
            matches = re.search(key, txt, re.I | re.U)
            if not matches:
                page_num = self.get_page_num(response.url)
                if page_num:
                    self.set_last_page(response.url, int(page_num))
        return []

    def set_last_page(self, url, value):
        path = urlparse(url).path
        self._last_page[path] = value

    def get_last_page(self, url):
        path = urlparse(url).path
        return self._last_page.get(path, self.MAX_PAGE)

    def get_page_num(self, url):
        qs = parse_qs(urlparse(url).query)
        if 'p' in qs:
            return int(qs['p'][0])
        return 0

    def process_request_filter(self, request):
        if self.get_page_num(request.url) > self.get_last_page(request.url):
            return None
        return request
class RingspanncorpSpider(CrawlSpider):
    name = "ringspanncorp"
    allowed_domains = ["ringspanncorp.com"]
    items = []
    start_urls = ['http://www.ringspanncorp.com/en/products/overview']
    rules = (
        Rule(SgmlLinkExtractor(), callback='parse_lol', follow=True),
    )

    def parse_lol(self, response):
        if response.xpath('//div[@id="product-view"]'):
            return self.parse_product(response)

    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        data = pandas.read_csv("mro/spiders/csv_data/Ringspanncorp/ringspanncorp.csv", sep=',')
        catalog = list(data.catalog_number)
        ids = list(data.id)
        description = list(data.description)
        key1 = list(data.key1)
        key2 = list(data.key2)
        catalog_key1 = dict(zip(catalog, key1))
        catalog_key2 = dict(zip(catalog, key2))
        catalog_description = dict(zip(catalog, description))
        catalog_id = dict(zip(catalog, ids))
        key1_ids = dict(zip(key1, ids))
        key2_ids = dict(zip(key2, ids))
        key1_catalog = dict(zip(key1, catalog))
        key2_catalog = dict(zip(key2, catalog))
        key1_description = dict(zip(key1, description))
        key2_description = dict(zip(key2, description))

        for catalog_n in catalog:
            key = catalog_key1[catalog_n]
            name = ' ' + str(key)
            if name in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    key_digits = catalog_key1[catalog_n].re('(\d+)')
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, meta={'item': item, 'key': key_digits},
                                  callback=self.cad_page)

        for catalog_n in catalog:
            key = catalog_key2[catalog_n]
            name = ' ' + str(key)
            name2 = str(key) + ' '
            if name in response.xpath('//h1').extract_first() or name2 in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    key_digits = catalog_key1[catalog_n].re('(\d+)')
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, meta={'item': item, 'key': key_digits},
                                  callback=self.cad_page)

    def cad_page(self, response):
        item = response.meta['item']
        key_digits = response.meta['key']  # stored under 'key' in parse_product
        data = pandas.read_csv("mro/spiders/csv_data/Ringspanncorp/ringspanncorp.csv", sep=',')
        catalog = list(data.catalog_number)
        ids = list(data.id)
        description = list(data.description)
        key1 = list(data.key1)
        key2 = list(data.key2)
        catalog_key1 = dict(zip(catalog, key1))
        catalog_key2 = dict(zip(catalog, key2))
        catalog_description = dict(zip(catalog, description))
        catalog_id = dict(zip(catalog, ids))
        key1_ids = dict(zip(key1, ids))
        key2_ids = dict(zip(key2, ids))
        key1_catalog = dict(zip(key1, catalog))
        key2_catalog = dict(zip(key2, catalog))
        key1_description = dict(zip(key1, description))
        key2_description = dict(zip(key2, description))

        for catalog_n in catalog:
            key = catalog_key1[catalog_n]
            name = ' ' + str(key)
            if name in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, callback=self.request_cad)

        for catalog_n in catalog:
            key = catalog_key2[catalog_n]
            name = ' ' + str(key)
            name2 = str(key) + ' '
            if name in response.xpath('//h1').extract_first() or name2 in response.xpath('//h1').extract_first():
                if catalog_n not in self.items:
                    item = UniversalItem()
                    item['ids'] = catalog_id[catalog_n]
                    item['catalog_number'] = catalog_n
                    self.items.append(catalog_n)
                    url = response.xpath('//a[@class="cad link_grey"]/@href')
                    yield Request(url=url, callback=self.request_cad)
class WikiSpider(CrawlSpider):
    """Crawls wikipedia starting at the seed page.

    Rate limited by obeying robots.txt (see settings.py), autothrottle,
    and a default delay of 1.1

    $ cd nlp/wikiscrapy
    $ scrapy crawl wiki -o wikipedia_erdos.json -t json

    >>> import subprocess
    >>> subprocess.check_output(['scrapy', 'crawl', 'wiki'], stderr=subprocess.STDOUT)
    """
    verbosity = 1
    name = 'wiki'
    download_delay = 1.1
    allowed_domains = ['en.wikipedia.org', 'en.wiktionary.org']  # , 'en.m.wikipedia.org']
    start_urls = ['''https://en.wikipedia.org/wiki/Paul_Erd%C5%91s''']
    rules = [
        Rule(SgmlLinkExtractor(allow=['/wiki/.+']), follow=True,
             process_links='filter_links', callback='parse_response'),
        #Rule(SgmlLinkExtractor(allow=['/wiki/.*']), 'parse_response'),
    ]

    def __init__(self, start_urls=None, *args, **kwargs):
        self.start_urls = ['''https://en.wikipedia.org/wiki/Paul_Erd%C5%91s''']
        if start_urls:
            if isinstance(start_urls, basestring):
                self.start_urls = [start_urls]
            else:
                self.start_urls = list(start_urls)
        super(WikiSpider, self).__init__(*args, **kwargs)

    def clean_list(self, l):
        ans = ['']
        for item in l:
            # TODO: regex to see if it's a number of the form 1.2.3 before creating a new line item
            #       and use the section number as a key or value in a dictionary
            stripped = item.strip()
            if stripped:
                ans[-1] += stripped + ' '
            if item.endswith('\n'):
                ans[-1] = ans[-1].strip()
                ans += ['']
        return ans

    def filter_links(self, links):
        filtered_list = []
        for link in links:
            if not RE_WIKIPEDIA_SPECIAL.match(link.url):
                filtered_list += [link]
        if self.verbosity > 1:
            print '-' * 20 + ' LINKS ' + '-' * 20
            print '\n'.join(link.url for link in filtered_list)
        # sleep(1.1)
        if self.verbosity > 1:
            print '-' * 20 + '-------' + '-' * 20
        return filtered_list

    def parse_response(self, response):
        # TODO:
        #   1. check for error pages and slow down or halt crawling
        #   2. throttle based on robots.txt
        #   3. save to database (so that json doesn't have to be loaded manually)
        #   4. use django Models rather than the scrapy.Item model
        #   5. incorporate into a django app (or make it a django app configurable through a web interface)
        #   6. incrementally build the occurrence matrix rather than saving raw data to the django/postgres db
        if self.verbosity > 1:
            print '=' * 20 + ' PARSE ' + '=' * 20
        sel = Selector(response)
        a = WikiItem()
        a['url'] = response.url
        a['title'] = ' '.join(
            sel.xpath("//h1[@id='firstHeading']//text()").extract())
        a['toc'] = ' '.join(
            self.clean_list(
                sel.xpath("//div[@id='toc']//ul//text()").extract()))
        a['text'] = ' '.join(
            sel.xpath('//div[@id="mw-content-text"]//text()').extract())
        a['modified'] = clean_wiki_datetime(
            sel.xpath('//li[@id="footer-info-lastmod"]/text()').re(r'([0-9]+\s*\w*)'))
        a['crawled'] = datetime.now()
        a['count'] = dict(Counter(get_words(a['text'])))
        if self.verbosity > 1:
            print '=' * 20 + '=======' + '=' * 20
        yield a
class ZzkSpider(CrawlSpider):
    name = "zzk"
    allowed_domains = [
        "39.net",
    ]
    start_urls = [
        "http://jbk.39.net/bw_t2",  # zhengzhuang (symptoms) home page
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'http://jbk.39.net/*')), follow=True, callback="parse_item"),
    )

    def parse_item(self, response):
        hxs = Selector(response)
        div_xpath = hxs.xpath(
            '//section[contains(@class, "main wrap")]/div[contains(@class, "list_con clearfix")]'
        )
        symptoms_tag = hxs.xpath(
            '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]/h1/cite[contains(@class, "bg2")]/text()'
        ).extract()
        sheet_name = hxs.xpath(
            '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]//div[contains(@id, "list_nav")]//li[contains(@class, "h")]/text()'
        ).extract()
        # If the tag is not "symptom", or there is no tag at all, return nothing
        if (cmp(str(symptoms_tag), "[u'\u75c7\u72b6']") == 0) and (cmp(
                str(sheet_name), "[u'\u7efc\u8ff0']") == 0):
            item = ZzkItem()
            item["url"] = response.url
            symptoms_name = hxs.xpath(
                '//section[contains(@class, "main wrap")]/header[contains(@class, "list_tit")]/h1/b/text()'
            ).extract()
            item["symptoms_name"] = symptoms_name
            symptoms_description = hxs.xpath(
                '//section[contains(@class, "main wrap")]/div[contains(@class, "list_con clearfix")]//dd[contains(@id, "intro")]/p[contains(@class, "sort2")]/text()'
            ).extract()
            item["symptoms_description"] = symptoms_description
            disease_info_list_path = div_xpath.xpath(
                '//div[contains(@class, "item")]//table[contains(@class, "dis")]/tr'
            )
            if not disease_info_list_path:
                return
            disease_item_list = []
            for disease_info in disease_info_list_path:
                class_name = disease_info.xpath('.//@class').extract()
                if cmp(str(class_name), "[u'name']") == 0:
                    disease_info_xpath = disease_info.xpath('./td')
                    disease_list = []
                    disease_url = disease_info.xpath(
                        './td[contains(@class, "name")]/a/@href').extract()
                    for disease in disease_info_xpath:
                        disease_name = disease.xpath('./a/@title').extract()
                        disease_list.append(disease_name)
                    disease_item = DiseaseItem()
                    disease_item["disease_name"] = disease_list[0]
                    disease_item["symptoms"] = disease_list[1]
                    disease_item["department"] = disease_list[2]
                    disease_item["disease_url"] = disease_url
                    disease_item_list.append(disease_item)
            item["disease"] = disease_item_list
            return item
class IfengSpider(CrawlSpider):
    name = "stock_ifeng"
    szIndexUrl = [
        "http://api.finance.ifeng.com/index.php/akdaily/?code=sh000001&tpye=fq",
        "http://api.finance.ifeng.com/index.php/akdaily/?code=sz399001&tpye=fq",
    ]
    allowed_domains = [
        "bestgo.com",
        "api.finance.ifeng.com",
        "app.finance.ifeng.com",
    ]
    start_urls = []
    day_str = []
    for day_span in range(1, 19, 5):
        day_str.append((date.today() - timedelta(days=day_span)).strftime('%Y%m%d'))
    for page_id in range(1, 32):
        for day in day_str:
            url = "http://www.bestgo.com/fund/SH/%s/1,%d.html" % (day, page_id)
            start_urls.append(url)
            url = "http://www.bestgo.com/fund/%s/1,%d.html" % (day, page_id)
            start_urls.append(url)
    for idx in [1, 2, 3, 5, 7]:
        url = "http://app.finance.ifeng.com/list/all_stock_cate.php?s=%d" % idx
        start_urls.append(url)

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'http://www.bestgo.com/fund/.*\d+/1.\d+\.html')),
             follow=True, callback="parse_item"),
        Rule(SgmlLinkExtractor(allow=(r'http://app.finance.ifeng.com/list/stock_cate.php\?c=\d+$')),
             follow=True, callback="parse_cate"),
    ]

    def parse_item(self, response):
        for url in self.szIndexUrl:
            yield scrapy.Request(url, self.parse_api)
        hxs = Selector(response)
        stocks = hxs.xpath('//div[@class="grid-view"]/table/tbody/tr')
        self.log(str(len(stocks)))
        for stock in stocks:
            stock_id = stock.xpath('./td/a/span/text()')
            if len(stock_id) < 1:
                continue
            stock_code = 'sz'
            if 'SH' in response.url:
                stock_code = 'sh'
            stock_code = stock_code + stock_id[0].extract()
            self.log('[' + response.url + "]:[" + stock_code + ']', level=scrapy.log.DEBUG)
            api_url = "http://api.finance.ifeng.com/index.php/akdaily/?code=%s&tpye=fq" % (stock_code)
            yield scrapy.Request(api_url, self.parse_api)

    def parse_api(self, response):
        stock_code = None
        params = response.url.split('?')[-1].split('&')
        for param in params:
            [k, v] = param.split('=')
            if k == 'code':
                stock_code = v
        if stock_code is None:
            self.log("no stock_id found in [" + response.url + "]", scrapy.log.CRITICAL)
        data = json.loads(response.body)
        records = data["record"]
        for record in records:
            self.log("stock history:" + stock_code + ":" + ",".join(record))
        item = StockItem()
        item["code"] = stock_code
        item["records"] = records
        item['cate'] = 'ifeng'
        return item

    def parse_cate(self, response):
        stock_cate = None
        params = response.url.split('?')[-1].split('&')
        for param in params:
            [k, v] = param.split('=')
            if k == 'c':
                stock_cate = v
        stock_code_list = []
        hxs = Selector(response)
        stock_codes = hxs.xpath('//table/tr/td[1]/a').extract()
        self.log("[cate_code]" + str(len(stock_codes)))
        for stock_code in stock_codes:
            items = stock_code.split('/')
            if len(items) < 6:
                continue
            code = items[6]
            self.log(code)
            stock_code_list.append(code)
        item = StockItem()
        item['code'] = stock_cate
        item['records'] = stock_code_list
        item['cate'] = "cate"
        return item
class CraigslistSpider(CrawlSpider):
    name = "craigslist"
    allowed_domains = ["craigslist.org"]
    # The original spider listed every index page explicitly
    # (index100.html through index9900.html in steps of 100);
    # the same start URLs are generated here.
    start_urls = [
        "http://lascruces.craigslist.org",
        "http://lascruces.craigslist.org/cas/",
    ] + ["http://lascruces.craigslist.org/cas/index%d.html" % n
         for n in range(100, 10000, 100)]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a')),
             callback="parse", follow=True),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        date_info = hxs.select("//h4[@class='ban']/span[@class='bantext']/text()")
        items = []
        file_to = open("things.txt", "a")
        file_to.write(response.body)
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            item["date"] = date_info.extract()
            items.append(item)
        return items
class MSSpider(CrawlSpider):
    name = "ms"
    allowed_domains = []
    f = open('/home/wyp/pub-owl-collector/ms_url.json', mode='r')
    d = json.loads(f.read())
    #print d
    #start_urls = [conf['link'] for conf in d]
    f.close()
    start_urls = [
        "http://academic.research.microsoft.com/Detail?entitytype=3&searchtype=2&id=69"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('Publication/')), callback='parse_item'),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//a[text()="Next"]')), follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = PapersItem()
        item['title'] = hxs.select(
            '//span[@id="ctl00_MainContent_PaperItem_title" and @class="title-span"]/text()'
        ).extract()
        item['fulltext'] = hxs.select(
            '//*[@id="ctl00_MainContent_PaperItem_downLoadList_ctl00_HyperLink2"]/@href'
        ).extract()
        item['description'] = [
            ''.join(hxs.select(
                '//*[@id="ctl00_MainContent_PaperItem_snippet"]/text()'
            ).extract())
        ]
        item['conference'] = hxs.select(
            '//*[@id="ctl00_MainContent_PaperItem_HLConference"]/text()'
        ).extract()
        patt2 = re.compile(r'^\W+(.*)$')
        item['year'] = [
            re.search(patt2, (hxs.select(
                '//*[@id="ctl00_MainContent_PaperItem_YearConference"]/text()'
            ).extract())[0]).group(1)
        ]
        nodes = hxs.select(
            '//div[@id="ctl00_MainContent_PaperItem_divPaper"]/div/a[@class="author-name-tooltip"]'
        )
        patt = re.compile('\/(\d+)\/')
        opener = urllib2.build_opener()
        author = []
        affiliation = []
        for node in nodes:
            author.append(node.select('text()').extract()[0])
            authorid = re.search(patt, node.select('@href').extract()[0]).group(1)
            f = opener.open(
                "http://academic.research.microsoft.com/io.ashx?authorID=%s" % authorid).read()
            d = json.loads(f)
            affiliation.append(d['Affiliation'] and d['Affiliation']['FullName'])
        item['author'] = author
        item['affiliation'] = affiliation
        return item
def test_urls_type(self):
    '''Test that the resulting urls are regular strings and not unicode objects'''
    lx = SgmlLinkExtractor()
    self.assertTrue(all(isinstance(link.url, str)
                        for link in lx.extract_links(self.response)))
class GroupSpider(CrawlSpider): name = "Group222" allowed_domains = ["douban.com"] start_urls = [ "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9", "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB", "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A", "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF", "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF", "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F", "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A", "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3" ] rules = [ Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )), callback='parse_group_home_page', process_request='add_cookie'), # Rule(SgmlLinkExtractor(allow=('/group/[^/]+/discussion\?start\=(\d{1,4})$', )), callback='parse_group_topic_list', process_request='add_cookie'), Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )), follow=True, process_request='add_cookie'), ] def __get_id_from_group_url(self, url): m = re.search("^http://www.douban.com/group/([^/]+)/$", url) if (m): return m.group(1) else: return 0 def add_cookie(self, request): request.replace(cookies=[]) return request def parse_group_topic_list(self, response): self.log("Fetch group topic list page: %s" % response.url) pass def parse_group_home_page(self, response): self.log("Fetch group home page: %s" % response.url) hxs = HtmlXPathSelector(response) item = DoubanItem() #get group name item['groupName'] = hxs.select('//h1/text()').re("^\s+(.*)\s+$")[0] #get group id item['groupURL'] = response.url groupid = self.__get_id_from_group_url(response.url) #get group members number members_url = "http://www.douban.com/group/%s/members" % groupid members_text = hxs.select('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)") item['totalNumber'] = members_text[0] #get relative groups item['RelativeGroups'] = [] groups = hxs.select('//div[contains(@class, "group-list-item")]') for group in groups: url = group.select( 'div[contains(@class, "title")]/a/@href').extract()[0] item['RelativeGroups'].append(url) #item['RelativeGroups'] = ','.join(relative_groups) return item
class GrabberSpider(CrawlSpider): name = "grabber" allowed_domains = [] # Let's think how to pass here url start_urls = [] rules = [ Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), Rule(SgmlLinkExtractor(allow=[r'.*\.css'], deny_extensions=[], tags=[ 'link', ], attrs=[ 'href', ]), callback='parse_css_item', follow=False), ] def check_local_domain_uniqueness(self, local_domain): q = self.dbsession.query(WebSite)\ .filter(WebSite.local_domain == local_domain) check = q.first() if check and check.original_url not in self.allowed_domains: return False return True def __init__(self, *args, **kw): # get extra parameters of scraper launch cmd SCRAPED_DOMAIN = kw.pop('SCRAPED_DOMAIN', None) START_URL = kw.pop('START_URL', None) LOCAL_DOMAIN = kw.pop('LOCAL_DOMAIN', None) if START_URL: self.start_urls = [START_URL] if SCRAPED_DOMAIN: if SCRAPED_DOMAIN.strip('.')[0] == 'www': self.allowed_domains = [ SCRAPED_DOMAIN, SCRAPED_DOMAIN.replace('www.', '') ] else: self.allowed_domains = [ SCRAPED_DOMAIN, 'www.%s' % SCRAPED_DOMAIN ] if LOCAL_DOMAIN is None: raise GrabberSpiderError('No local_url is specified for job') super(GrabberSpider, self).__init__(*args, **kw) log.msg('Init SQL alchemy engine', level=log.DEBUG) engine = engine_from_config(WEB_APP_SETTINGS, 'sqlalchemy.') conn = engine.connect() self.dbsession = Session(bind=conn) # patch orm objects to use this local session object Base.metadata.create_all(engine) # while use creating DB here if not self.check_local_domain_uniqueness(LOCAL_DOMAIN): raise GrabberSpiderError('%s is already used in db') q = self.dbsession.query(WebSite)\ .filter(WebSite.original_url == SCRAPED_DOMAIN) website = q.first() if website is None: website = WebSite(original_url=SCRAPED_DOMAIN, local_domain=LOCAL_DOMAIN) self.dbsession.add(website) self.dbsession.commit() self.website = website ''' Check directory for media and create it if it does not exist ''' media_dir = WEB_APP_SETTINGS.get('downloaded.path') if media_dir: if not os.path.exists(media_dir): os.mkdir(media_dir) else: raise Exception( 'Directory for downloaded media is not specified in settings') ''' Check id downloaded media url is in application settings ''' if WEB_APP_SETTINGS.get('downloaded.url') is None: raise Exception('URL for downloaded media is not specified') def prepare_link(self, url, current_url): ''' Make proper uri from given url ''' if not current_url.endswith('/'): current_url += '/' # ignore javascript links for s in ['javascript:', 'mailto:', '#']: if url.startswith(s): return None if url.find('http://') != -1 or url.find('https://') != -1: # in case we have complete http or https protocol uri url = url.replace('http://', '').replace('https://', '') url_domain = url.split('/')[0] # Scrape just this site urls if url_domain == self.website.original_url: return url else: return None # don't scrape external links if url.startswith('/'): # we get absolute url return "http://%s%s" % (self.website.original_url, url) else: return "%s%s" % (current_url, url) def parse_item(self, response): log.msg('I\'m here: %s' % response.url, level=log.DEBUG) return self.handle_page(response) def parse_css_item(self, response): log.msg('I\'m here: %s' % response.url, level=log.DEBUG) return self.handle_page(response, css=True) def _get_path(self, url): path = url.replace('http://', '') path = path.split('/') path[0] = '' path = '/'.join(path) return urllib.unquote_plus(path) def handle_page(self, response, css=False): path = self._get_path(response.url) content = 
response.body.decode(response.encoding) item = WebPageItem(uri=path, content=content, css=css, response=response) return item
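prepare_link in the grabber spider rebuilds absolute URLs by string concatenation; the standard library already does that resolution. A sketch of the same filtering with urlparse.urljoin, assuming the spider still only wants links whose host is self.website.original_url (the method name normalize_link is made up here):

from urlparse import urljoin, urlparse

def normalize_link(self, url, current_url):
    """Resolve url against the page it was found on; return None for
    non-http or off-site links."""
    if url.startswith(('javascript:', 'mailto:', '#')):
        return None
    absolute = urljoin(current_url, url)   # handles relative, root-relative and absolute forms
    hostname = urlparse(absolute).hostname or ''
    if hostname not in (self.website.original_url,
                        'www.%s' % self.website.original_url):
        return None                        # don't follow external links
    return absolute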
class AppstoreSpider(CrawlSpider): name = 'appstore' allowed_domains = ['itunes.apple.com'] start_urls = [ 'http://itunes.apple.com/us/genre/ios-books/id6018?mt=8', 'http://itunes.apple.com/us/genre/ios-business/id6000?mt=8', 'http://itunes.apple.com/us/genre/ios-catalogs/id6022?mt=8', 'http://itunes.apple.com/us/genre/ios-education/id6017?mt=8', 'http://itunes.apple.com/us/genre/ios-entertainment/id6016?mt=8', 'http://itunes.apple.com/us/genre/ios-finance/id6015?mt=8', 'http://itunes.apple.com/us/genre/ios-food-drink/id6023?mt=8', 'http://itunes.apple.com/us/genre/ios-games/id6014?mt=8', 'http://itunes.apple.com/us/genre/ios-health-fitness/id6013?mt=8', 'http://itunes.apple.com/us/genre/ios-lifestyle/id6012?mt=8', 'http://itunes.apple.com/us/genre/ios-medical/id6020?mt=8', 'http://itunes.apple.com/us/genre/ios-music/id6011?mt=8', 'http://itunes.apple.com/us/genre/ios-navigation/id6010?mt=8', 'http://itunes.apple.com/us/genre/ios-news/id6009?mt=8', 'http://itunes.apple.com/us/genre/ios-newsstand/id6021?mt=8', 'http://itunes.apple.com/us/genre/ios-photo-video/id6008?mt=8', 'http://itunes.apple.com/us/genre/ios-productivity/id6007?mt=8', 'http://itunes.apple.com/us/genre/ios-reference/id6006?mt=8', 'http://itunes.apple.com/us/genre/ios-social-networking/id6005?mt=8', 'http://itunes.apple.com/us/genre/ios-sports/id6004?mt=8', 'http://itunes.apple.com/us/genre/ios-travel/id6003?mt=8', 'http://itunes.apple.com/us/genre/ios-utilities/id6002?mt=8', 'http://itunes.apple.com/us/genre/ios-weather/id6001?mt=8', ] rules = ( Rule(SgmlLinkExtractor(allow='letter=[\w\*]+'), follow=True, callback="parse_applist"), Rule(SgmlLinkExtractor(allow='letter=[\w\*]+\&page=[\d]+'), follow=True, callback="parse_applist"), ) def parse(self, response): r = list(CrawlSpider.parse(self, response)) return r + list(self.parse_applist(response)) def parse_applist(self, response): #parse_applist hxs = HtmlXPathSelector(response) category = hxs.select('//title/text()').extract()[0].split( '-')[0].strip() idx = 0 for url, name in zip( hxs.select('//div[contains(@class,"column")]/ul/li/a/@href' ).extract(), hxs.select('//div[contains(@class,"column")]/ul/li/a/text()'). 
extract()): if not '/app/' in url: continue i = AppItem() i['name'] = name i['url'] = url i['id'] = url.split('/')[-1].split('?')[0] i['category'] = category i['last_update'] = datetime.date.today().isoformat() i['store'] = 'appstore' idx += 1 yield i def parse_app(self, response): #parse_app hxs = HtmlXPathSelector(response) i = AppStoreItem() i['name'] = hxs.select('//div/div/h1/text()').extract()[0] i['url'] = response.url i['id'] = response.url.split('/')[-1].split('?')[0] attrs = hxs.select('//div[@id="content"]') i['description'] = "\n".join( attrs.select('//div[@class="product-review"]/p/text()').extract()) i['artwork'] = attrs.select( '//div[@class="lockup product application"]/a/div/img/@src' ).extract() i['price'] = attrs.select('//div[@class="price"]/text()').extract()[0] i['release_date'] = attrs.select( '//li[@class="release-date"]/text()').extract()[0] release_date, version, size, languages, seller, copyright = tuple( attrs.select('//li/text()').extract())[ 0:6] #hugely unsafe but that's how we roll i['release_date'] = release_date i['version'] = version i['size'] = size i['languages'] = languages i['seller'] = seller seller_link = hxs.select('//div[@class="app-links"]/a/@href').extract() if len(seller_link) > 1: i['seller_link'] = seller_link[0] else: i['seller_link'] = '' i['copyright'] = copyright i['rating'] = attrs.select( '//a[@href="http://itunes.apple.com/WebObjects/MZStore.woa/wa/appRatings"]/text()' ).extract()[0] try: requirements = attrs.select( '//div[@class="lockup product application"]/p/text()').extract( )[0] except: requirements = '' i['requirements'] = requirements i['reviews'] = '' #todo i['screenshots'] = "|".join( hxs.select( '//div[@class="swoosh lockup-container application large screenshots"]//img/@src' ).extract()) i['is_iphone'] = 'iPhone' in requirements i['is_ipad'] = 'iPad' in requirements i['is_ipod'] = 'iPod' in requirements i['last_update'] = datetime.date.today().isoformat() i['store'] = 'appstore' yield i
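The appstore spider overrides parse only so the start URLs themselves are run through parse_applist in addition to the CrawlSpider rules. CrawlSpider exposes parse_start_url for exactly that; a sketch of the equivalent, assuming nothing else depends on the custom parse:

def parse_start_url(self, response):
    # CrawlSpider calls this for each start URL; the rules still run as usual.
    return self.parse_applist(response)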
class LegalActsSpider(ManoSeimasSpider): name = 'legal-acts' allowed_domains = ['lrs.lt'] # p_drus - document type # p_kalb_id - language # p_rus - order by # p_gal - document status start_urls = [ # # Current versions # ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?' # 'p_drus=102&p_kalb_id=1&p_rus=1&p_gal=33'), # Legislation ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?' 'p_drus=1&p_kalb_id=1&p_rus=1&p_gal='), # # Law drafts # ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?' # 'p_drus=2&p_kalb_id=1&p_rus=1&p_gal='), # Constitution ('http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?' 'p_drus=8&p_kalb_id=1&p_rus=1&p_gal='), #'http://www3.lrs.lt/pls/inter3/dokpaieska.rezult_l?p_nr=&p_nuo=2010%2006%2001&p_iki=&p_org=&p_drus=2&p_kalb_id=1&p_title=&p_text=&p_pub=&p_met=&p_lnr=&p_denr=&p_es=0&p_rus=1&p_tid=&p_tkid=&p_t=0&p_tr1=2&p_tr2=2&p_gal=', ] rules = ( Rule(SgmlLinkExtractor(allow=r'dokpaieska.rezult_l\?')), Rule(SgmlLinkExtractor(allow=r'dokpaieska.susije_l\?p_id=-?\d+$'), 'parse_related_documents'), Rule( SgmlLinkExtractor(allow=r'dokpaieska.showdoc_l\?p_id=-?\d+.*', deny=r'p_daug=[1-9]'), 'parse_document'), ) pipelines = (pipelines.ManoseimasPipeline, ) def _fix_name_case(self, act): name = act.get_output_value('name') for idx, words in ( (-2, (u'ĮSTATYMO PROJEKTAS')), (-1, (u'ĮSTATYMAS')), ): a, b = split_by_words(name, idx) if b in words: act.replace_value('name', '%s %s' % (a, b.lower())) return def _parse_law_act(self, response, hxs, base=False): """ Extracts basic document information and returns law act loader. Parameters: base Return only base information about document. This is used, when filling some information bits to a law act from several law act documents. """ lang = hxs.select('tr[1]/td[4]/b/text()').extract()[0].strip().lower() if lang not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'): self.error(response, 'Unknown language: %s' % lang) if lang != u'lietuvių': return None act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS) act.add_xpath('_id', 'tr[1]/td[2]/b/text()') source = self._get_source(response.url, 'p_id') if not act.get_output_value('_id'): act.replace_value('_id', u'NONUMBER-%s' % source['id']) if base: return act act.add_xpath('name', 'caption/text()') act.add_xpath('kind', 'tr[1]/td[1]/b/text()') act.add_xpath('number', 'tr[1]/td[2]/b/text()') act.add_xpath('date', 'tr[1]/td[3]/b/text()') act.add_value('source', source) self._fix_name_case(act) return act def _involved_parts(self, response, hxs, act): involved_string = hxs.select('tr[3]/td[1]/b/text()').extract() involved_string = ' '.join(involved_string) if not involved_string: return None m = DOCUMENT_INVOLVED_PARTS.match(involved_string) if not m: return None involved = Loader(self, response, DocumentInvolved(), hxs, required=( 'date', 'how', 'institution', )) involved.add_value('date', m.group(1)) involved.add_value('how', m.group(2).lower()) institution = m.group(3) if ',' in institution: # TODO: move this to utility function, same code is also used # in manoseimas/scrapy/spiders/mps.py:171 spl = institution.replace(u'Švietimo, mokslo', u'Švietimo%2c mokslo') spl = map(lambda x: urllib.unquote(x.strip()), spl.split(',')) spl = filter(None, spl) if len(spl) == 2: person, institution = spl else: person, group, institution = spl spl = group.strip().split() group_types = (u'komitetas', u'grupė', u'frakcija', u'komisija') if spl[-1].lower() in group_types: group_type = spl[-1].lower() elif spl[0].lower() in group_types: group_type = spl[0].lower() else: group_type = 
None if group_type: involved.add_value('group', group) involved.add_value('group_type', group_type) else: self.error(response, 'Not committee: %s' % group) involved.add_value('person', person) involved.add_value('institution', institution) act.add_value('involved', dict(involved.load_item())) def _extract_html_as_attachment(self, response, loader, xpath, name): text = HtmlXPathSelector(response).select(xpath).extract() text = clean_html('\n'.join(text)) body = text.encode('utf-8') loader.add_value('_attachments', [(name, body, 'text/html')]) def _get_legislation_links(self, response, hxs): for link in hxs.select('tr[4]/td/a'): text = get_first(link, 'text()') if text == u'Susiję dokumentai': url = get_absolute_url(response, get_first(link, '@href')) yield Request(url, callback=self.parse_related_documents) def _legislation(self, response, hxs): act = self._parse_law_act(response, hxs) if not act: raise StopIteration self._involved_parts(response, hxs, act) self._extract_html_as_attachment( response, act, "/html/body/*[name()='div' or name()='pre']", 'original_version.html') act.reset_required(*(REQUIRED_FIELDS + ('_attachments', ))) yield act.load_item() for request in self._get_legislation_links(response, hxs): yield request def _current_edition(self, response, hxs): # Do not collect documents, if they are not currently valid. valid_edition = hxs.select('tr[4]/td[1]/a[2]/font/b/text()') if (valid_edition and valid_edition.extract()[0] == u'Galiojanti aktuali redakcija'): raise StopIteration act = self._parse_law_act(response, hxs, base=True) if act: self._extract_html_as_attachment( response, act, "/html/body/*[name()='div' or name()='pre']", 'updated_version.html') act.reset_required('_id', '_attachments') yield act.load_item() def parse_document(self, response): # Some thimes lrs.lt returns empty page... 
if not response.body: return xpath = '/html/body/table[2]' hxs = HtmlXPathSelector(response).select(xpath)[0] # Get document kind kind = hxs.select('tr[1]/td[1]/b/text()').extract()[0].strip().lower() if kind in (u'konstitucija', u'įstatymas', u'įstatymo projektas', u'kodeksas'): items = self._legislation(response, hxs) elif kind == u'aktuali redakcija': items = self._current_edition(response, hxs) else: items = [] for item in items: yield item def parse_related_documents(self, response): xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr' hxs = HtmlXPathSelector(response).select(xpath) act = Loader(self, response, LegalAct(), hxs, required=('_id', )) act.add_xpath('_id', 'td[2]/b/text()') if not act.get_output_value('_id'): p_id = unicode(self._get_query_attr(response.url, 'p_id')) act.replace_value('_id', u'NONUMBER-%s' % p_id) relations = defaultdict(list) xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr' for row in HtmlXPathSelector(response).select(xpath): docid = get_all(row, 'td[4]/span//text()') rel_type = row.select('td[6]/span/text()') if rel_type: rel_type = rel_type.extract()[0].strip().lower() if rel_type in (u'pakeistas dokumentas', u'ankstesnė dokumento redakcija'): relations['amends'].append(docid) elif rel_type == u'priimtas dokumentas': relations['adopts'].append(docid) elif rel_type == u'ryšys su taikymą nusakančiu dokumentu': relations['defines_applicability'].append(docid) elif rel_type == u'ryšys su galiojimą nusakančiu dokumentu': relations['defines_validity'].append(docid) elif rel_type == u'negalioja de jure': relations['defines_as_no_longer_valid'].append(docid) elif rel_type == u'kitas projekto variantas': relations['new_draft_version'].append(docid) elif rel_type == u'kitas projekto variantas': relations['new_draft_version'].append(docid) elif rel_type == u'ryšys su ratifikavimo dokumentu': relations['ratification'].append(docid) if relations: act.add_value('relations', dict(relations)) yield act.load_item() def _find_related_law(self, db, doc): keyword = u' įstatymo ' if 'name' not in doc or keyword not in doc['name']: return False name = doc['name'].split(keyword, 2)[0] + u' įstatymas' rs = db.view('scrapy/by_name', key=name, include_docs=True) if len(rs) > 0: doc.setdefault('relations', {})['law'] = [rs.rows[0]['id']] return True def _set_type(self, db, doc): if 'kind' not in doc: return False if (doc['kind'] in (u'įstatymas', u'konstitucija') and not doc.get('relations')): doc['type'] = u'įstatymas' return True elif doc['kind'] == u'įstatymas': doc['type'] = u'įstatymo pataisa' return True def post_process(self, db, started): #for row in db['legalact'].view('scrapy/by_update_time', # startkey=started, include_docs=True): for row in db['legalact'].view('_all_docs', include_docs=True): doc = row.doc changed = False for fn in (self._set_type, self._find_related_law): changed = fn(db['legalact'], doc) or changed if changed: db['legalact'][doc['_id']] = doc
class FollowAllSpider(Spider):
    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        # str.lstrip() strips a set of characters, not a prefix, so
        # lstrip('www.') would also eat leading 'w's from other hostnames;
        # strip the 'www.' prefix explicitly instead.
        hostname = urlparse(url).hostname
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        self.allowed_domains = [hostname]
        self.link_extractor = SgmlLinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url,
                    size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0]
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
class IndeedSpider(CrawlSpider): name = "indeed" allowed_domains = ["indeed.com"] pages = 4 url_template = "http://www.indeed.com/jobs?q=%s&l=Chicago&start=%s" start_urls = [] rules = ( Rule(SgmlLinkExtractor(restrict_xpaths=("//div[@class='row ' or @class='row lastRow']/h2/a/@href"))), Rule(SgmlLinkExtractor(allow=('http://www.indeed.com/jobs',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=False), ) #Initialize the start_urls job_queries = [] with open('job_queries.cfg', 'r') as f: for line in f: job_queries.append(line.strip()) # Build out the start_urls to scrape for job_query in job_queries: for page in range(1,pages): full_url = url_template % (job_query, str(page*10)) start_urls.append(full_url) ''' def __init__(self, *args, **kwargs): # Get the search queries for the jobs from the job_queries.cfg file # Config file must have 1 query per line super(IndeedSpider, self).__init__(*args, **kwargs) ''' def get_job_description(self, html, item): summary_string = item['summary'][0][1:-5] root = lxml.html.document_fromstring(html) target_element = None # For some reason the summary will not match the lxml extracted text, figure out why # This solution is hacky # Get only the first sentence # Indeed cobbles together multiple sentences from the job posting summary_string = summary_string.split(".",1)[0] summary_start_list = summary_string.split(" ")[:3] summary_start = " ".join(summary_start_list) counter = 0 # Find the element that contains the initial words in the summary string for element in root.iter(): counter += 1 if element.text: if (summary_start in element.text): target_element = element print 'YES. element.txt' break elif element.tail: if (summary_start in element.tail): target_element = element print 'YES element.tail' break generation_count = 0 target_ancestor = None job_posting_min_length = 400 job_posting_max_length = 10000 # Find the best parent element that contains the entire job description without the extra html if target_element is not None: if target_element.text_content() > job_posting_min_length: target_ancestor = target_element elif 'shiftgig' in item['source_url']: target_ancestor = target_element else: for ancestor in target_element.iterancestors(): generation_count += 1 ancestor_text = ancestor.text_content() target_ancestor = ancestor # The loop will pre-maturely break once the ancestor elements has minimum threshold of characters if len(ancestor_text) > job_posting_min_length: break return target_ancestor.text_content() def parse_next_site(self, response): item = response.request.meta['item'] item['source_url'] = response.url item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S') job_description = self.get_job_description(response.body, item) item['full_description'] = job_description return item def parse_item(self, response): ''' import pdb pdb.set_trace() ''' self.log('\n Crawling %s\n' % response.url) hxs = HtmlXPathSelector(response) sites = hxs.select("//div[@class='row ' or @class='row lastRow']") #sites = hxs.select("//div[@class='row ']") items = [] #Skip top two sponsored ads for site in sites[:-2]: item = IndeedItem(company='none') item['job_title'] = site.select('h2/a/@title').extract() link_url= site.select('h2/a/@href').extract() item['link_url'] = link_url item['crawl_url'] = response.url item['location'] = site.select("span[@itemprop='jobLocation']/span[@class='location']/span[@itemprop='addressLocality']/text()").extract() # Not all entries have a company company_name = 
site.select("span[@class='company']/span[@itemprop='name']/text()").extract() if company_name == []: item['company'] = [u''] else: item['company'] = company_name item['summary'] =site.select("table/tr/td/div/span[@class='summary']/text()").extract() #item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract() item['found_date'] =site.select("table/tr/td/span[@class='date']/text()").extract() #item['source_url'] = self.get_source(link_url) if len(item['link_url']): request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site) request.meta['item'] = item yield request return
class DealExtreme(CrawlSpider): name = "dealextreme" allowed_domains = ["dx.com"] start_urls = [url.strip() for url in urllist] rules = ( Rule(SgmlLinkExtractor( allow=(), restrict_xpaths=("//div[@class='pagenumber']", )), follow=True), Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=("//p[@class='title']", )), callback="parse_product", follow=True), ) def parse(self, response): sel = Selector(response) # LinkExtraction # script = sel.xpath("//script[contains(text(),'productAttrs: [')]").extract()[0] # script = re.findall(r'productAttrs(.*)',script) # links = re.findall(r'Url":(.*?)",',script[0]) # for link in links: # link = "http://www.dx.com/p/"+link.replace('"',"") # row = [link] # mywriter.writerow(row) #ProductExtraction pname = 'Impertus ' + sel.xpath("//h1/span/text()").extract()[0] metainfo = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" images = sel.xpath( "//ul[@class='product-small-images']//img/@src").extract() price = sel.xpath("//span[@id='price']/text()").extract()[0] price = float(price) * 115 / 100 * 4 prince = str(price) category = ( sel.xpath('//div[@class="position"]/a[last()-1]/text()').extract() [0] + '/' + sel.xpath('//div[@class="position"]/a[last()]/text()').extract()[0] ) description = [ "DISCLAIMER: LiveYourSport.com does not take responsibility for any support claims and technical troubleshooting." + "This product is not valid for any technical support, warranty after purchase or protected by our after sales services." + "We only offer protection against delivery damages and manufacturing defects claimed within 10 days of delivery of the product." ] description = description.append( sel.xpath("//div[@id='overview']").extract() + sel.xpath("//div[id='specification']").extract()) sku = sel.xpath("//span[@id='sku']/text()").extract()[0] + 'DXMDCHN' row = [ "Product", "", pname + '*', "Impertus", price * 140 / 100, price * 140 / 100, price, description, sku, 'DEALEXTREME', category, pname, '12-19 Working Days', 100, 'N', -270, metainfo, metainfo, metainfo, 'Y', 'By Product', 1, 2, 3, 4, 5 ] for image in images: image = image.replace("//img", "img") row.append(image) mywriter.writerow(row)
class CompuindiaSpider(CrawlSpider): name = 'compuindia' allowed_domains = ['compuindia.com'] start_urls = ['http://www.compuindia.com'] #~start_urls = ['http://www.compuindia.com/touch-pc.html'] #~start_urls = ['http://www.compuindia.com/touch-pc/new-inspiron-242.html'] urlList = [] # DONE # If you are writing a process_value, then you will # have to return the links you want. You can not leave it # without returning. # process_value # NOT USED FTM def pv(value): """This function takes values links from Rules, and can process those links in any manner""" # Removing a link having "deals.html" in it, as we dont want that link to be crawled if re.search('.*deals\.html', value): return None else: return value # NOT USED FTM def pl(value): print sys._getframe().f_code.co_name print 'value' print type(value) print value print 'length: ' + str( len(value)) sys.exit('X') def parse_item(response): #~self.log('Hi, this is an item page! %s' % response.url) #~print 'function' #~print sys._getframe().f_code.co_name #~print 'response' #~print response #~print type(response) #~print "response.url" #~print response.url hxs = HtmlXPathSelector(response) # ATM, all these item values are coming in a List type with just the 0th key item = CompuindiaItem() item['sourceurl'] = [ response.url ] #~item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract() # Code: Unicode item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract().encode('utf-8') # Code: String item['price'] = hxs.select('//span[@class="price"]/text()')[0].extract().encode('utf-8') # left item['color'] = [None] # Try to do matching with class="last odd" #~item['color'] = hxs.select('//tbody/tr[@class="last odd"]') item['name'] = hxs.select("//div[@class='product-name']/h1/text()").extract()[0] #~item['features'] = hxs.select('//ul[@class="config_listing_pd_page"]/li').extract() item['features'] = hxs.select('//ul[@class="config_listing_pd_page"]/li/text()').extract() #~item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract() item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract()[0].encode('utf-8') #~item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract() item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract()[0].encode('utf-8') item['moreDescription'] = [None] #~item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract() item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract()[0].encode('utf-8') item['relatedProducts'] = [None] # FTM #IMAGES main_img = [] image_urls = [] main_img = hxs.select("//p[@class='product-image']/a/@href").extract() img_urls = hxs.select("//div[@class='more-views']/ul/li/a/@href").extract() item['image_urls'] = list( set( main_img + img_urls ) ) #IMAGES- #~print 'item' #~print item #~ #~sys.exit('S') return item rules = ( # Extract links matching 'category.php' (but not matching 'subsection.php') # and follow links from them (since no callback means follow=True by default). 
### Now 3rd Page # Extracting the actual data and images from Product Page #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-name']" ] ), callback=parse_item, follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-view']" ], deny = [".*tata-photon.*", ] ), callback=parse_item, follow=True ), # FIN THIS #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='product-view']" ], deny = [".*tata-photon.*", ] ), callback = parse_item, follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='col-main']/div[@class='product-view']" ], deny = [".*tata-photon.*", ] ), callback = parse_item, follow=False ), ### Now 2nd Page #~Rule( SgmlLinkExtractor( restrict_xpaths = "//h4[@class='product-name']", process_value= pv2 ), follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='category-products']", process_value= pvN ), follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='category-products']", process_value= pvN ), follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']", process_value= pvn ), follow=True ), # This time i dont need to do process_links #~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']" ), process_links=pl, follow=True ), # ===PAGE 2: DONE=== # THESE 2, put it as, list in "restrict_xpaths" # IF: # THIS #~Rule( SgmlLinkExtractor( restrict_xpaths="//h4[@class='product-name']" ), follow=True ), # THIS #~Rule( SgmlLinkExtractor( restrict_xpaths="//a[@class='next i-next']" ), process_links=pl, follow=True ), # ELSE: #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), process_links=pl, follow=True ), #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), process_links=pl, follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']", "//a[@class='next i-next']", ] ), follow=True ), # FIN THIS #PAGE2: NEXT LINKS Rule( SgmlLinkExtractor( restrict_xpaths=[ "//a[@class='next i-next']", ] ), follow=True ), # FIN THIS # PAGE2: PRODUCT LINKS #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//h4[@class='product-name']" ] ), follow = True ), Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='category-products']" ], deny = [".*dir=.*", ".*order=.*", ], ), callback=parse_item , follow = False ), #~Rule( SgmlLinkExtractor( restrict_xpaths=[ "//div[@class='category-products']/ul[@class='products-grid']" ] ), follow = True ), # Not working #~Rule( SgmlLinkExtractor( restrict_xpaths="//a[starts-with(., 'next')]" ), process_links=pl, follow=True ), # ===PAGE 1: DONE=== # CompuIndia.com ### 1st Page # Awesome!!! 
# Now links are getting extracted from first page, # now go to next page, add another rule & get the item links, with another rule, # then third page which is actual product page, extract real data from there #~Rule( SgmlLinkExtractor( deny = ( 'deals\.html', ), restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ), follow=True ), #~Rule( SgmlLinkExtractor( deny= re.compile('.*deals\.html', re.I), restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ), follow=True ), # Since i m putting process_value, i have to do a return value and return None, wherever necessary, otherwise the parser stops following links # as they haven't been returned # THIS #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", process_value= pv ) , follow=True ), # FIN THIS Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", deny = [".*deals\.htm.*"], ) , follow=True ), #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']", deny = [".*deals\.htm.*"], process_value= pv ) , follow=True ), #~Rule( SgmlLinkExtractor(process_value=pv) , follow=False ), #~Rule( SgmlLinkExtractor( deny = 'deals\.html', restrict_xpaths = "//div[@class='menu']", process_value= pv ), follow=True ), #~Rule( SgmlLinkExtractor( deny = 'deals\.html', restrict_xpaths = "//div[@id='custommenu']", process_value= pv ), follow=True ), # Y is this following? #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@id='custommenu']", process_value= pv ), follow=False ), # Working too. #~Rule( SgmlLinkExtractor( process_value= pv ), follow=False ), #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='parentMenu']" ) , callback = "chk_urls" ), #~Rule( SgmlLinkExtractor( restrict_xpaths = "//div[@class='custommenu']" ) , process_links = "chk_urls" ), #~Rule( SgmlLinkExtractor( restrict_xpaths = ["//div[@class='parentMenu']/a/@href", ] ) , follow=True, process_request = "chk_urls" ), #~Rule( SgmlLinkExtractor( ) , process_request = "chk_urls" ), # Extract links matching 'item.php' and parse them with the spider's method parse_item #~Rule( SgmlLinkExtractor( allow=('item\.php', ) ), callback='parse_item' ), ) #~def parse_start_url(self, response): #~print "parse_start_url" #~print 'response' #~print response #~ #~pass #~def parse_item(self, response): #~print "response" #~print response #~ #~item = CompuindiaItem() #~item['url'] = response.url #~ #~return item #~print "self.urlList" #~print self.urlList #~pass #~ #~return
class doubanSpider(CrawlSpider): name = "doubanmovie" allowed_domains = ["movie.douban.com"] start_urls = ["http://movie.douban.com/top250"] rules = [ Rule( SgmlLinkExtractor( allow=(r'http://movie.douban.com/top250\?start=\d+.*'))), Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')), callback="parse_item"), ] def parse_item(self, response): sel = Selector(response) movie_name = sel.select( "//div[@id='content']/h1/span[1]/text()").extract() movie_director = sel.select( "//*[@id='info']/span[1]/span[2]/a/text()").extract() movie_writer = sel.select( "//*[@id='info']/span[2]/span[2]/a/text()").extract() movie_score = sel.xpath( "//*[@id='interest_sectl']/div/div[2]/strong/text()").extract() movie_classification = sel.xpath( "//span[@property='v:genre']/text()").extract() movie_description_paths = sel.select("//*[@id='link-report']") movie_description = [] for movie_description_path in movie_description_paths: movie_description = movie_description_path.select( ".//*[@property='v:summary']/text()").extract() movie_roles_paths = sel.select("//*[@id='info']/span[3]/span[2]") movie_roles = [] for movie_roles_path in movie_roles_paths: movie_roles = movie_roles_path.select( ".//*[@rel='v:starring']/text()").extract() movie_detail = sel.select("//*[@id='info']").extract() item = WorkItem() item['movie_name'] = ''.join(movie_name).strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';') item['movie_director'] = movie_director[0].strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace( ':', ';') if len(movie_director) > 0 else '' item['movie_score'] = movie_score[0].strip().replace(',', ';').replace( '\'', '\\\'').replace('\"', '\\\"').replace( ':', ';') if len(movie_director) > 0 else '' item['movie_classification'] = movie_classification[0].strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace( ':', ';') if len(movie_director) > 0 else '' item['movie_description'] = movie_description[0].strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace( ':', ';') if len(movie_description) > 0 else '' item['movie_writer'] = ';'.join(movie_writer).strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';') item['movie_roles'] = ';'.join(movie_roles).strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';') movie_detail_str = ''.join(movie_detail).strip() movie_language_str = ".*语言:</span>(.+?)<span.*".decode("utf8") movie_date_str = ".*上映日期:</span> <span property=\"v:initialReleaseDate\" content=\"(\S+?)\">(\S+?)</span>.*".decode( "utf8") movie_long_str = ".*片长:</span> <span property=\"v:runtime\" content=\"(\d+).*".decode( "utf8") pattern_language = re.compile(movie_language_str, re.S) pattern_date = re.compile(movie_date_str, re.S) pattern_long = re.compile(movie_long_str, re.S) movie_language = re.search(pattern_language, movie_detail_str) movie_date = re.search(pattern_date, movie_detail_str) movie_long = re.search(pattern_long, movie_detail_str) item['movie_language'] = "" if movie_language: item['movie_language'] = movie_language.group(1).replace( '<br>', '').strip().replace(',', ';').replace('\'', '\\\'').replace( '\"', '\\\"').replace(':', ';') item['movie_date'] = "" if movie_date: item['movie_date'] = movie_date.group(1).strip().replace( ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';') item['movie_long'] = "" if movie_long: item['movie_long'] = movie_long.group(1) yield item
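The long .replace() chains in the douban movie spider exist to keep commas, quotes and colons from breaking a comma-separated dump further down the pipeline. If the output really is CSV, the csv module's own quoting makes that escaping unnecessary; a minimal sketch of a hypothetical pipeline (name and field choice are made up):

import csv

class MovieCsvPipeline(object):
    """Hypothetical pipeline: csv.writer quotes fields itself, so item values
    can keep their original commas, quotes and colons."""

    def open_spider(self, spider):
        self.fh = open('movies.csv', 'wb')
        self.writer = csv.writer(self.fh, quoting=csv.QUOTE_ALL)

    def process_item(self, item, spider):
        self.writer.writerow([
            item.get('movie_name', u'').encode('utf-8'),
            item.get('movie_director', u'').encode('utf-8'),
            item.get('movie_score', u'').encode('utf-8'),
        ])
        return item

    def close_spider(self, spider):
        self.fh.close()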
class SpiderSpider(CrawlSpider): count = 0 name = "pcconnection_tv" dic = set() allowed_domains = init_allowed_domains start_urls = init_start_urls rules = ( #only extract links here Rule(SgmlLinkExtractor(allow=allowed_url), callback="parse"), ) @property def sleep_time(self): return random.random() * MAX_SLEEP_TIME def parse(self, response): ''' extract title content url ''' print '>'*50 print 'response url: ', response.url hxs = HtmlXPathSelector(response) print '>>>> repsonse.url: ', response.url #get urls content_urls = hxs.select(content_url_format).extract() list_urls = hxs.select(list_url_format).extract() list_urls = [ up.urljoin(response.url, url) for url in list_urls] content_urls = [ up.urljoin(response.url, url) for url in content_urls] print "@" * 60 time.sleep(self.sleep_time) self.start_urls.extend(list_urls) for url in list_urls: yield Request(url, self.parse) content_re = re.compile(r'http://www[.]pcconnection[.]com/.*cac=Result') for url in content_urls: if content_re.match(url): if len(self.dic) > 600: self.start_urls = [] raise CloseSpider('reach pages limit, end the spider.') self.count += 1 self.dic.add( hash(url)) #extract data item = SpiderItem() item['url'] = url item['kind'] = self.name yield item else: print "!!!!!!! not match content url:" print url
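The pcconnection spider throttles itself with time.sleep inside parse, which blocks the whole crawler while it waits. Scrapy's own settings give a per-request delay and an item-count stop without blocking; a sketch of the equivalent settings.py entries (the values are placeholders):

# settings.py (sketch) -- tune the numbers to match MAX_SLEEP_TIME and the
# 600-item limit used in the spider.
DOWNLOAD_DELAY = 1.5              # average politeness delay, in seconds
RANDOMIZE_DOWNLOAD_DELAY = True   # waits 0.5x-1.5x DOWNLOAD_DELAY (this is the default)
CLOSESPIDER_ITEMCOUNT = 600       # stop after ~600 items instead of raising CloseSpider by hand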
def parse(self, response): print('inside a thread') hxs = HtmlXPathSelector(response) filename_ = response.url.split("/")[-2][1:] filename = os.path.abspath(databasePath + "\data\%s" % filename_) dumpFilePath = os.path.abspath(databasePath + "\dump\%s" % filename_) try: a = response.meta['page'] except KeyError: a = 0 os.mkdir(dumpFilePath) with open(filename, 'a') as f: #header forumTitle = hxs.select( '//div[@class="module forums"]/h2/text()').extract( )[0].encode('ascii', 'ignore').replace('\n', '') extraInfo = hxs.select( '//div[@class="module forums discussion tid"]/h4/text()' ).extract()[0].encode('ascii', 'ignore').replace('\n', '') f.write("title:" + forumTitle + "\n") f.write("extraInfo:" + extraInfo + "\n") f.write(response.url + "\n") f.write(filename + "\n") f.write(dumpFilePath + "\n\n") with open(dumpFilePath + "\\" + str(a) + '.html', 'a') as fd: fd.write(response.body) with open(filename, 'a') as f: for entry in hxs.select('//div[contains(@class,"forums-thread")]'): msgID = entry.select('span/@id').extract()[0] msgDate = entry.select('h4/text()').extract()[0].encode( 'ascii', 'ignore').replace('\n', '') msgText = ' '.join( entry.select('span/text()').extract()).encode( 'ascii', 'ignore').replace('\n', '') try: mgAuthor = entry.select( 'h3/span/a/text()').extract()[0].encode( 'ascii', 'ignore').replace('\n', '') except: mgAuthor = 'none' try: msgTitle = entry.select( 'h3/strong/text()').extract()[0].encode( 'ascii', 'ignore').replace('\n', '') except: msgTitle = "none" f.write('msgID:' + msgID + '\n') f.write('msgTitle:' + msgTitle + '\n') f.write('mgAuthor:' + mgAuthor + '\n') f.write('msgDate:' + msgDate + '\n') f.write('msgText:' + msgText + '\n\n') s = SgmlLinkExtractor( restrict_xpaths=['//li[contains(@class, "next")]']) Links = s.extract_links(response) if len(Links) > 0: print 'going to the next page' r = Request(googc + Links[0].url, callback=self.parse) r.meta['page'] = a + 1 yield r
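The forum parser above glues its data and dump paths together with hard-coded backslashes, which only resolves correctly on Windows and invites escape-sequence surprises. A minimal sketch of the same paths with os.path.join, assuming databasePath, filename_ and the page counter a from the snippet:

import os

# Portable equivalents of databasePath + "\data\%s" and "\dump\%s":
filename = os.path.abspath(os.path.join(databasePath, 'data', filename_))
dump_dir = os.path.abspath(os.path.join(databasePath, 'dump', filename_))
if not os.path.isdir(dump_dir):
    os.makedirs(dump_dir)                      # create the dump directory once, not only on page 0
dump_file = os.path.join(dump_dir, '%d.html' % a)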
class HuoBiaoSpider(CrawlSpider):
    name = 'huo_biao'
    allowed_domains = ['www.huobiao.cn']
    start_urls = ['http://www.huobiao.cn/search/word/%E7%89%A9%E4%B8%9A/']
    # The '?' must be escaped; as a bare regex quantifier the pattern never
    # matches the real /detail?id=... URLs.
    rules = (Rule(SgmlLinkExtractor(allow=(r'/detail\?id=\w+', )),
                  callback='parse_page',
                  follow=True), )
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Referer": "http://www.huobiao.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    # Override the base method to issue a custom login request; the callback
    # runs once the request succeeds.
    def start_requests(self):
        return [
            Request("http://www.huobiao.cn/do_login",
                    meta={'cookiejar': 1},
                    callback=self.post_login)
        ]

    # FormRequest was giving trouble here
    def post_login(self, response):
        # After a successful login, the after_login callback is invoked.
        return [
            FormRequest.from_response(
                response,
                # "http://www.zhihu.com/login",
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,  # note the custom headers here
                formdata={
                    'phone': '1341953*****',
                    'password': '******',
                    'checkout': 'on'
                },
                callback=self.after_login,
            )
        ]

    def after_login(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_page(self, response):
        problem = Selector(response)
        item = HuobiaoItem()
        item['title'] = problem.xpath('//span[@class="name"]/text()').extract()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
        item['title'] = problem.xpath(
            '//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        item['description'] = problem.xpath(
            '//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath(
            '//div[@class=" zm-editable-content clearfix"]/text()').extract()
        return item
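The login flow tags the do_login request with a 'cookiejar' meta key, but after_login then uses make_requests_from_url, which does not carry that key forward, so the follow-up requests may not reuse the logged-in session. A sketch of an after_login that keeps the same cookie jar, assuming the jar id from the login response should be reused:

from scrapy.http import Request

def after_login(self, response):
    # Keep using the cookie jar tagged on the login request, so the session
    # cookies from do_login are sent with the search pages too.  Requests
    # generated by the CrawlSpider rules would need the same meta key (e.g.
    # via a process_request hook) to stay logged in while following links.
    for url in self.start_urls:
        yield Request(url,
                      meta={'cookiejar': response.meta['cookiejar']},
                      headers=self.headers,
                      callback=self.parse)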
class UnderArmour(CrawlSpider): name = "underarmour1" allowed_domains = ["underarmour.com"] start_urls = [#"https://www.underarmour.com/en-us/mens/footwear/basketball-shoes",] #"https://www.underarmour.com/en-us/mens/apparel/tops/hoodies"] "https://www.underarmour.com/en-us/outlet/mens/tops"] # rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="grid-content"]',)) # , callback="parse_items", follow= True),) rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="next"]',)), follow= True), Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//div[@class="bottom-section"]',)) , callback="parse_items", follow= True),) csvfile = None printHeader = True def to_csv(self, item): if self.printHeader: self.csvfile = open('UnderArmour.csv','w') if self.csvfile: strWrite = '' #headers if self.printHeader: strWrite +='Item Type,Product Name,Brand Name,Price,Retail Price,Sale Price,Product Description,Product Code/SKU,' strWrite +='Category,Option Set,Product Availability,Current Stock Level,Free Shipping,Sort Order, Meta Description,Page Title, Product Image Description - 1,Product Image Is Thumbnail - 1,' strWrite +='Track Inventory,Product Image Sort - 1,Product Image Sort - 2,Product Image Sort - 3,Product Image Sort - 4,Product Image Sort-5,' strWrite +='Product Image File - 1,Product Image File - 2,Product Image File - 3,Product Image File - 4,Product Image File - 5 , \n' self.printHeader = False #print basic product data strWrite += 'Product,'+item["Product_Name"]+ ',' + item["Brand_Name"] + ',' strWrite += item["Price"] + ','+ item["Retail_Price"] + ',' + item ["Sale_Price"] + ',' strWrite += ';'.join(item["Product_Description"]).replace(',',';').replace('\n',"").replace("</div>","").replace("<h2>","").replace("</h2>","") + ',' + item["Product_Code"] + ',' #for Images strWrite += item["Category"] + ',' + ';'.join(item["Option_Set"]) + ',' + item["Product_Availability"] +',' strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"] + ',' + item['MetaDescription'] + ',' + item['TitleTag'] + ',' strWrite += item["Product_Image_Description_1"] + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ',' strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ',' strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ',' #strWrite += ','.join(item["Product_Image_File1"]) + +'\n' strWrite += ';'.join(item["Product_Image_File1"]) + ',' strWrite += '\n' #print variant for sizes in item['variants']: strWrite += 'SKU,[S]Size= US ' + sizes + ',,,,,,' + item["id1"]+ "-" + sizes + item["color"]+',,,,,,,,,,,,,,,,\n' self.csvfile.write(strWrite.encode('utf8')) #--BASIC PRODUCT DATA STARTS HERE-- def parse_items(self,response): #def parse(self,response): sel = Selector(response) item = BigCItem() item ["Item_Type"] = "Product" #Product Name color = sel.xpath("//span[@class='current-color-selection']/span[2]/text()") pname = sel.xpath("//h1[@itemprop='name']/text()") item ["Product_Name"] = pname.extract()[0] + " " + color.extract()[0]+"*" item["MetaDescription"] = "Get your hands on the " + pname.extract()[0] + " " + color.extract()[0] + ". 
Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" item["TitleTag"] = "Buy the " + pname.extract()[0] + " " + color.extract()[0] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" #Pricing mrp = response.xpath("//span[@class='buypanel_productprice--orig']/text()") if mrp: item ["Retail_Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice--orig']/text()").extract()[0])[0]) * 65*131/100+700) item ["Sale_Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value sale-price']/text()").extract()[0])[0]) * 65*131/100+700) item ["Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice--orig']/text()").extract()[0])[0]) * 65*131/100+700) else: item ["Retail_Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value']/text()").extract()[0])[0]) * 65*131/100+700) item ["Price"] = str(float(re.findall(r'\d+\.?\d+|\d+',sel.xpath("//span[@class='buypanel_productprice-value']/text()").extract()[0])[0]) * 65*131/100+700) item ["Sale_Price"] = "" #brandName item ["Brand_Name"] = "Under Armour" #Product Code Extraction id = response.xpath("//meta[@property='og:url']/@content").extract() for url in id: item ["Product_Code"] = url.split('id')[-1] + color.extract()[0] item["id1"] = url.split('id')[-1] item["color"] = color.extract()[0] #Product Description #Product Description desc1 = sel.xpath("//span[@itemprop='description']/text()") desc2 = sel.xpath("//div[@class='buypanel_productdescription is-collapsed']/ul") item["Product_Description"] = desc1.extract() + desc2.extract() #ImageFile item["Product_Image_File1"] = [x.replace("a248.e.akamai.net/f/248/9086/10h/","").split('?')[0] for x in sel.xpath("//div[@class='buypanel_productcaroitem--mobile']/img/@src").extract()] #CATEGORY #cat= sel.xpath("//h1[@itemprop='name']/text()") #if item["Category"] = "Shoes/Men's Shoes/Basketball Shoes; Team Sports/Basketball/Basketball Shoes" #Other Constants item["Option_Set"] = sel.xpath("//h1[@itemprop='name']/text()").extract() item["Product_Availability"] = "12-17 Working Days" item["Current_Stock"] = "100" item["Free_Shipping"] = "N" item["Sort_Order"] = "-300" item["Product_Image_Description_1"] = "Buy " + sel.xpath("//h1[@itemprop='name']/text()").extract()[0] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" item["Product_Image_Is_Thumbnail_1"] = "Y" item["Track_Inventory"] = "By Product" item["Product_Image_Sort_1"] = "1" item["Product_Image_Sort_2"] = "2" item["Product_Image_Sort_3"] = "3" item["Product_Image_Sort_4"] = "4" item["Product_Image_Sort_5"] = "5" #---Sizes/Variants Start Here---- item["variants"] = {} script = response.xpath("//script[contains(.,'JSON.parse')]").extract()[0] scriptJson = script.split('JSON.parse(')[3].split('),')[0] scriptJsonDict = json.loads(scriptJson) for x in scriptJsonDict['MATERIALS']: item["variants"][x['CODE']] = {} item["variants"][x['CODE']]["sizes"] = {} item["variants"][x['CODE']]["colorCode"] = x['COLOR']['PRIMARY']['CODE'] item["variants"][x['CODE']]["color"] = x['COLOR']['PRIMARY']['NAME'] item["variants"][x['CODE']]["colorRGB"] = x['COLOR']['PRIMARY']['RGB'] item["variants"][x['CODE']]["Retail_Price"] = x['PRICE']['ORIG']['MAX'] item["variants"][x['CODE']]["Sale_Price"] = x['PRICE']['CURRENT']['MAX'] item["variants"][x['CODE']]["image"] = [] for c in x['ASSETS']: 
item["variants"][x['CODE']]["image"].append('https://origin-d4.scene7.com/is/image/Underarmour/'+c['NAME']+'?scl=2') for c in x['SIZES']: item["variants"][x['CODE']]["sizes"][c['CODE']] = {} item["variants"][x['CODE']]["sizes"][c['CODE']]['inventory'] = c['INVENTORY'] item["variants"][x['CODE']]["sizes"][c['CODE']]['name'] = c['NAME'] self.to_csv(item); return item
def test_restrict_xpaths(self): lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', )) self.assertEqual([link for link in lx.extract_links(self.response)], [ Link(url='http://example.com/sample1.html', text=u''), Link(url='http://example.com/sample2.html', text=u'sample 2'), ])