def parse_url(self, response):
    """Yield one item per href found in the elemRelative anchors."""
    for href in response.xpath('//a[@class="elemRelative"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse(self, response):
    """Extract the headword and UK/US phonetics from a dictionary page.

    Item fields: title = queried word (from request meta), category =
    headword as displayed on the page, content = UK phonetic wrapped in
    brackets, item_name = US phonetic wrapped in brackets.
    """
    word = response.meta.get("keyword")
    show_word, uk_phonetic, us_phonetic = '', '', ''
    di_title = response.xpath(
        '//div[@class="di-title"]//span[@class="hw dhw"]')
    if di_title:
        show_word = ''.join(di_title[0].xpath("./text()").extract())
    uk_span = response.xpath(
        '//div[@class="pos-header dpos-h"]//span[@class="uk dpron-i "]/span[@class="pron dpron"]'
    )
    if uk_span:
        uk_phonetic = ''.join(uk_span[0].xpath('.//text()').extract())
        if uk_phonetic:
            # Page renders phonetics as /.../ — convert to [...] form.
            uk_phonetic = "[" + uk_phonetic.strip('/') + "]"
    us_span = response.xpath(
        '//div[@class="pos-header dpos-h"]//span[@class="us dpron-i "]/span[@class="pron dpron"]'
    )
    if us_span:
        # BUG FIX: the original read uk_span[0] here, so the US phonetic
        # always duplicated the UK one; it must come from us_span.
        us_phonetic = ''.join(us_span[0].xpath('.//text()').extract())
        if us_phonetic:
            us_phonetic = "[" + us_phonetic.strip('/') + "]"
    item = SpiderframeItem()
    item['title'] = word             # queried word
    item['category'] = show_word     # headword shown on the page
    item['content'] = uk_phonetic    # British-English phonetic
    item['item_name'] = us_phonetic  # American-English phonetic
    yield item
def parse(self, response):
    """Yield absolute article URLs built from the list-box anchors."""
    base = "http://www.enet.gr/"
    for href in response.xpath('//div[@class="box"]//ul/li//a//@href').extract():
        item = SpiderframeItem()
        item['url'] = base + href
        yield item
def parse_link(self, response):
    """Yield one item per headline link (//h2/a)."""
    for href in response.xpath('//h2/a/@href').extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse_url(self, response):
    """Yield items only for links that contain an https scheme."""
    hrefs = response.xpath('//article/a//@href').extract()
    for href in hrefs:
        if "https://" not in href:
            continue  # skip relative / non-https links
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse(self, response):
    """Return one item with headword plus British/American phonetics.

    Fields: title = queried word, category = displayed headword,
    content = British phonetic, item_name = American phonetic.
    """
    def _first(query):
        # First matching text node, or '' when absent.
        found = response.xpath(query).extract()
        return found[0] if found else ''

    item = SpiderframeItem()
    item['title'] = response.meta.get("keyword")  # queried word
    item['category'] = _first('//div[@class="webtop"]/h1/text()')
    item['content'] = _first(
        '//div[@class="webtop"]/span[@class="phonetics"]/div[@class="phons_br"]/span/text()'
    )
    item['item_name'] = _first(
        '//div[@class="webtop"]/span[@class="phonetics"]/div[@class="phons_n_am"]/span/text()'
    )
    return item
def parse_url(self, response):
    """Yield absolute Hurriyet Daily News article URLs."""
    host = "http://www.hurriyetdailynews.com"
    for href in response.xpath('//div[@class="news"]/a/@href').extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse_url(self, response):
    """Yield absolute Masrawy article URLs."""
    host = "https://www.masrawy.com"
    for href in response.xpath('//a[@class="item"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse(self, response):
    """Scrape the pronunciation block of a dictionary result page.

    Yields a single item: title = queried word (last "=" segment of the
    URL), category = audio word key parsed from the first @naudio
    attribute seen, content = UK phonetic, item_name = US phonetic.
    Nothing is yielded when the headword <h1> is absent.

    (The large commented-out sentence-scraping block that used to sit at
    the top of this method has been removed as dead code.)
    """
    word = response.url.split("=")[-1]
    word_tag = response.xpath(
        '//div[@class="word-cont"]/h1/text()').extract()  # displayed headword
    if word_tag:
        spans = response.xpath('//div[@class="phonetic"]/span')
        en_phonetic, am_phonetic, phonetic_word = '', '', []
        if spans:
            # BUG FIX: the loop variable was named `item`, shadowing the
            # SpiderframeItem built below; renamed to `span`.
            for span in spans:
                lang_mark = span.xpath("./text()").extract()  # tag text tells UK vs US apart
                if not lang_mark:
                    continue
                label = ''.join(lang_mark).strip()
                label = label.replace(" '", '').replace("’", '')
                if label == "英":  # "UK"
                    en_phonetic = ''.join(
                        span.xpath('./bdo[@lang="EN-US"]/text()').extract())
                    en_word = ''.join(span.xpath('./i[1]/@naudio').extract())
                    if en_word:
                        phonetic_word.append(en_word.split("=")[-1])
                elif label == "美":  # "US"
                    am_phonetic = ''.join(
                        span.xpath('./bdo[@lang="EN-US"]/text()').extract())
                    am_word = ''.join(span.xpath('./i[1]/@naudio').extract())
                    if am_word:
                        phonetic_word.append(am_word.split("=")[-1])
        audio_word = phonetic_word[0] if phonetic_word else ''
        item = SpiderframeItem()
        item['title'] = word             # queried word
        item['category'] = audio_word    # word key from the audio attribute
        item['content'] = en_phonetic    # British-English phonetic
        item['item_name'] = am_phonetic  # American-English phonetic
        yield item
def parse_link(self, response):
    """Yield an item for every tracked article link on the page."""
    selector = '//a[contains(@class, "url track-click")]/@href'
    for href in response.xpath(selector).extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse_url(self, response):
    """Yield an item for every teaser-title link on the page."""
    selector = '//a[contains(@class, "dre-item__title")]/@href'
    for href in response.xpath(selector).extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse_url(self, response):
    """Yield absolute Al Ittihad article URLs from the h3 headlines."""
    host = "https://www.alittihad.ae"
    for href in response.xpath('//h3/a/@href').extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse(self, response):
    """Yield an item for every lightbox (cboxElement) iframe link."""
    for href in response.xpath('//a[@class="iframe cboxElement"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse(self, response):
    """Yield one item per sentence row found inside the #tmTable element."""
    rows = response.xpath('//*[@id="tmTable"]/div/div[2]/span/span')
    for row in rows:
        text = ''.join(row.xpath(".//span/text()").extract())
        item = SpiderframeItem()
        item['url'] = response.url
        item['content'] = text
        yield item
def parse_url(self, response):
    """Yield an item for every teaser ("ankeiler") link."""
    for href in response.xpath('//a[@class="ankeiler__link"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse_url(self, response):
    """Yield absolute gp.se article URLs from the teaser links."""
    host = "https://www.gp.se"
    for href in response.xpath('//a[@class="c-teaser__link"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse_url(self, response):
    """Yield absolute information.dk article URLs."""
    host = "https://www.information.dk"
    selector = '//div[@class="field field-name-field-webrubrik"]/a/@href'
    for href in response.xpath(selector).extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse(self, response):
    """Yield absolute Al Khaleej article URLs."""
    host = "http://www.alkhaleej.ae"
    for href in response.xpath('//a[@class="differentTall"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = host + href
        yield item
def parse_url(self, response):
    """Yield absolute Tages-Anzeiger URLs, percent-encoding each path first."""
    host = "https://www.tagesanzeiger.ch"
    for href in response.xpath('//h3/a/@href').extract():
        item = SpiderframeItem()
        # Quote the path before prefixing the host; some links carry
        # characters that must be percent-encoded.
        item['url'] = host + urllib.parse.quote(href)
        yield item
def parse_url(self, response):
    """Yield only the links that point at jyllands-posten.dk."""
    hrefs = response.xpath('//div[contains(@class, "art")]//a/@href').extract()
    for href in hrefs:
        if "https://jyllands-posten.dk" not in href:
            continue
        item = SpiderframeItem()
        item['url'] = href
        yield item
def parse_url(self, response):
    """Decode the JSON payload and yield the large MP4 CDN URL."""
    payload = demjson.decode(response.text)
    video = payload["data"]["item"]
    item = SpiderframeItem()
    item["url"] = video.get("cdn_url").get("large")
    yield item
def parse(self, response):
    """Return one item joining the description and body paragraphs."""
    desc = response.xpath(
        '/html/body/section[2]/section[1]/section[1]/p/text()').extract()
    body = response.xpath(
        '/html/body/section[2]/section[1]/section[1]/article/p/text()'
    ).extract()
    item = SpiderframeItem()
    item['url'] = response.url
    item['content'] = ' '.join(desc + body).replace('\n', '')
    return item
def parse(self, response):
    """Yield an article item; category is the first hostname label of the URL."""
    item = SpiderframeItem()
    item['url'] = response.url
    # url.split('/')[2] is the netloc; its first dot-label becomes the category.
    item['category'] = response.url.split('/')[2].split('.')[0]
    item['title'] = ''.join(
        response.xpath('//h1[contains(@class, "title")]/text()').extract())
    item['content'] = ''.join(response.xpath('//section/p/text()').extract())
    yield item
def parse(self, response):
    """Yield an article item built from the h1 title and all paragraph text."""
    item = SpiderframeItem()
    item['url'] = response.url
    # Second-to-last path segment; presumably a section slug — confirm against the site.
    item['category'] = response.url.split('/')[-2]
    item['title'] = ''.join(response.xpath('//h1/text()').extract())
    item['content'] = ''.join(response.xpath('//p/text()').extract())
    yield item
def parse_content(self, response):
    """Walk the nested "items" mapping and yield one item per expanded URI.

    The payload shape is items -> {k1: {k2: [entry, ...]}}; only the first
    entry of each non-empty list is used, and only when it has a "target".
    """
    resp = demjson.decode(response.text)
    # BUG FIX: the default must be a dict — the old default of [] raised
    # AttributeError on .values() whenever the "items" key was absent.
    data = resp.get("items", {})
    for v1 in data.values():
        for v2 in v1.values():
            # isinstance is the idiomatic type check (was `type(v2) is list`).
            if isinstance(v2, list) and v2:
                first = v2[0]
                if first.get("target"):
                    item = SpiderframeItem()
                    item['url'] = first["target"]["expandedUri"]
                    yield item
def parse(self, response):
    """Yield an article item; newlines and tabs are flattened to spaces."""
    body = ''.join(response.xpath('//p//text()').extract())
    body = body.replace("\n", " ").replace("\t", " ")
    item = SpiderframeItem()
    item['url'] = response.url
    item['category'] = response.url.split('/')[3]  # first path segment of the URL
    item['title'] = ''.join(response.xpath('//h1/text()').extract())
    item['content'] = body
    yield item
def parse_link(self, response):
    """Follow pagination links, then yield an item per article link.

    BUG FIX: the pagination selector was '//li[@class="nxtnav "]/a/href'
    (missing the '@' on href) and never called .extract(), so Selector
    objects — not URL strings — were handed to scrapy.Request.
    """
    next_urls = response.xpath('//li[@class="nxtnav "]/a/@href').extract()
    for next_url in next_urls:
        yield scrapy.Request(url=next_url,
                             callback=self.parse_link,
                             dont_filter=True)
    links = response.xpath('//div[@class="mask-title"]/a/@href').extract()
    for link in links:
        item = SpiderframeItem()
        item['url'] = link
        yield item
def parse(self, response):
    """Yield an article item; newlines and tabs are flattened to spaces."""
    body = ''.join(response.xpath('//p/text()').extract())
    body = body.replace("\n", " ").replace("\t", " ")
    item = SpiderframeItem()
    item['url'] = response.url
    item['category'] = response.url.split('/')[-2]  # second-to-last path segment
    item['title'] = ''.join(
        response.xpath('//h2[@class="page-title"]/text()').extract())
    item['content'] = body
    yield item
def parse_item(self, response):
    """Yield items for "sequence" section links, then follow rel=next paging."""
    selector = "//section[(contains(@class, 'sequence'))]//a/@href"
    for href in response.xpath(selector).extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
    nxt = response.xpath(
        '//section/div/nav[1]/a[@rel="next"]/@href').extract()
    if nxt:
        yield scrapy.Request(url=nxt[0],
                             callback=self.parse_item,
                             dont_filter=True)
def parse_content(self, response):
    """Yield cleaned sentence rows scraped from the nested CNKI table layout."""
    rows = response.xpath('//span//tr/td/table//tr[3]/td/table//tr')
    for row in rows:
        text = "".join(row.xpath('.//td[not(@class)]//text()').extract())
        # Skip empty rows and the "incomplete match" placeholder rows.
        if text == "" or "未完全匹配句对" in text:
            continue
        text = text.strip()
        item = SpiderframeItem()
        item['content'] = text
        item['title'] = response.meta.get("keyword")
        item['category'] = 'cnki'
        item['item_id'] = md5(text)  # content hash used as a dedup key
        yield item