def parse(self, response):
    # Extract current revisions
    for revision in response.css('ul#pagehistory li'):
        comment = revision.css('span.comment').extract_first()
        date = revision.css('a.mw-changeslist-date::text').extract_first()
        # Fixed selector typo ('aw.' -> 'a.') and the undefined name 'rcid' below.
        rvid = revision.css(
            'a.mw-changeslist-date::attr(href)').extract_first()
        ip = revision.css('a.mw-anonuserlink bdi::text').extract_first()
        user = revision.css('a.mw-userlink bdi::text').extract_first()
        size = revision.css('span.history-size::text').extract_first()
        tags = revision.css('span.mw-tag-marker::text').extract()
        item = WikiItem(
            comment=comment,
            date=date,
            rvid=rvid,
            ip=ip,
            user=user,
            size=size,
            tags=tags,
        )
        yield item
    # Get next revision history page
    next_page = response.css('a.mw-nextlink::attr(href)').extract_first()
    if next_page is not None:
        next_page = response.urljoin(next_page)
        # Be kind and treat Wikimedia with care
        # time.sleep(1)
        yield scrapy.Request(next_page, callback=self.parse)
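# A minimal sketch of the WikiItem this spider assumes (not shown in the
# snippet): a scrapy.Item with one Field per attribute populated above.
# The field names follow from the keyword arguments; everything else is
# an assumption.
import scrapy

class WikiItem(scrapy.Item):
    comment = scrapy.Field()
    date = scrapy.Field()
    rvid = scrapy.Field()
    ip = scrapy.Field()
    user = scrapy.Field()
    size = scrapy.Field()
    tags = scrapy.Field()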
def parse(self, response):
    rows = response.xpath('//div[@id="mw-content-text"]/div/table/tr')
    patterns = [
        './td[1]/i/a/text()', './td[1]/i/b/a/text()',
        './td[1]/i/span[2]//text()', './td[1]/i/b/span/text()'
    ]
    for row in rows:
        for pattern in patterns:
            film = row.xpath(pattern).extract()
            # film = row.xpath(pattern).extract_first()
            film = self.verify(film)
            if film:
                break
        year = row.xpath('./td[2]/a/text()').extract()
        year = self.verify(year)
        awards = row.xpath('./td[3]/text()').extract()
        awards = self.verify(awards)
        nominations = row.xpath('./td[4]/text()').extract()
        nominations = self.verify(nominations)
        item = WikiItem()
        item['film'] = film
        item['year'] = year
        item['awards'] = awards
        item['nominations'] = nominations
        yield item
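# verify() is referenced above but not defined in this snippet. A plausible
# minimal sketch, assuming it collapses the extracted list into a cleaned
# string, or None when nothing matched:
def verify(self, values):
    text = ''.join(values).strip()
    return text if text else None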
def parse_wikipedia_page(self, response):
    item = WikiItem()
    soup = BeautifulSoup(response.body, 'html.parser')
    item['url'] = response.url
    item['title'] = soup.find('h1', {'id': 'firstHeading'}).string
    item['desc'] = soup.find('div', {'id': 'mw-content-text'}).find('p')
    # Collect all unique links in the description field
    unique_links = set()
    for link in soup.find('div', {
            'id': 'mw-content-text'
    }).find('p').findAll('a', attrs={'href': re.compile('^/wiki/')}):
        unique_links.add('http://en.wikipedia.org{}'.format(
            link.get('href')))
    item['links'] = list(unique_links)
    # Get footer information on the page
    item['footer'] = soup.find('ul', {
        'id': 'footer-info'
    }).find('li', {
        'id': 'footer-info-lastmod'
    }).string
    return item
def parse(self, response): """Parse the wikipedia page of a historical figure""" # Extract the figure's image, birth and death date info_box = Selector(text=response.css('.infobox').extract_first()) image_url = info_box.xpath('//a[@class="image"]//@src').extract_first() image_url = "https:" + image_url death_date = info_box.xpath( '//*[@class="dday deathdate"]//text()').extract_first() death_date_raw = info_box.select( "//th[contains(text(), 'Died')]/following-sibling::td/text()" ).extract_first() birth_date = info_box.xpath( '//*[@class="bday"]//text()').extract_first() birth_date_raw = info_box.select( "//th[contains(text(), 'Born')]/following-sibling::td/text()" ).extract_first() # Create the wiki item wiki_item = WikiItem() wiki_item["image_url"] = image_url wiki_item["death_date"] = death_date wiki_item["death_date_raw"] = death_date_raw wiki_item["birth_date"] = birth_date wiki_item["birth_date_raw"] = birth_date_raw wiki_item["curid"] = response.meta['curid'] yield wiki_item
def parse(self, response):  # first method in the spider file
    # Find all the table rows
    rows = response.xpath('//*[@id="mw-content-text"]/div/table/tbody/tr')
    # The movie title could be of different styles, so we need to provide all the possibilities.
    patterns = ['./td[1]/i/a/text()', './td[1]/i/b/a/text()',
                './td[1]/i/span[2]//text()', './td[1]/i/b/span/text()']
    for row in rows:
        # extract() returns a Python list; extract_first() returns the first element of that list.
        # If you know the first element is what you want, use extract_first().
        for pattern in patterns:
            film = row.xpath(pattern).extract_first()
            if film:
                break
        # If the movie title is missing, then we just skip the row.
        if not film:
            continue
        # Relative xpath for all the other columns
        year = row.xpath('./td[2]/a/text()').extract_first()
        awards = row.xpath('./td[3]/text()').extract_first()
        nominations = row.xpath('./td[4]/text()').extract_first().strip()
        # Initialize a new WikiItem instance for each movie.
        item = WikiItem()
        item['film'] = film
        item['year'] = year
        item['awards'] = awards
        item['nominations'] = nominations
        yield item
def parse_wiki_page(self, response): #print("Parsed: ", response.url) item = WikiItem() item['url'] = response.url item['title'] = BeautifulSoup( response.xpath('//h1[@id="firstHeading"]').extract_first(), "lxml").text item['info'] = BeautifulSoup( response.xpath('//div[@id="mw-content-text"]/*/p[1]'). extract_first(), "lxml").text[:255] + "..." #print("Links: ", list(map(lambda link: link.url, # self.link_extractor.extract_links(response)))) page_urls = set( map(lambda link: link.url, self.link_extractor.extract_links(response))) page_urls.discard(response.url) item['out_urls'] = page_urls return item
def createItem(self, response):
    item = WikiItem()
    # init Fields for correct sort
    item['uid'] = ""
    # URL from crawled Site (used for generatedUID -> elastic)
    m = re.search(r'(http[s]?://)?([^/\s]+)(.*)', response.url)
    if m:
        relativeUrl = m.group(3)
        item['url'] = "https://de.wikipedia.org" + relativeUrl
    else:
        # Fall back to the raw response URL ('url' was undefined here)
        item['url'] = "https://de.wikipedia.org" + response.url
    responseSelector = Selector(response)
    # Plugin for easy HTML parsing
    soup = BeautifulSoup(responseSelector.extract(), 'html.parser')
    item['pageTitle'] = soup.find('title').text
    item['text'] = ""
    for p_tag in soup.findAll('p'):
        item['text'] = item['text'] + p_tag.text.replace(
            "\t", " ").replace("\r", " ").replace("\n", " ").replace(
                "  ", " ").strip()
    # HTML Content of parsed Component
    item['html'] = responseSelector.extract()
    # Generated UID which is used as UID for Elastic, so every Item is unique
    item['uid'] = self.generateUID(item, 'utf-8')
    return item
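# generateUID() is referenced above but not shown. A plausible minimal
# sketch, assuming the UID is a stable hash of the item's URL; the actual
# scheme used for Elastic may differ:
import hashlib

def generateUID(self, item, encoding):
    return hashlib.sha256(item['url'].encode(encoding)).hexdigest()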
def parse(self, response):
    # Find all the table rows.
    # The response object passes all the website info to the script.
    # rows = response.xpath('//*[@id="mw-content-text"]/div/table/thead/tbody/tr')#[1:]
    # rows = response.xpath('//*[@id="mw-content-text"]/div/table/tbody/tr')[1:]
    rows = response.xpath('//*[@id="constituents"]/tbody/tr')[1:]
    for row in rows:
        # Relative xpath for all the columns
        symbol = row.xpath('./td[1]/a/text()').extract_first()
        name = row.xpath('./td[2]/a/text()').extract_first()
        sector = row.xpath('./td[4]/text()').extract_first()
        sub_industry = row.xpath('./td[5]/text()').extract_first()
        hq = row.xpath('./td[6]/a/text()').extract_first()
        dt_add = row.xpath('./td[7]/text()').extract_first()
        founded = row.xpath('./td[9]/text()').extract_first().strip()
        item = WikiItem()
        item['symbol'] = symbol
        item['name'] = name
        item['sector'] = sector
        item['sub_industry'] = sub_industry
        item['hq'] = hq
        item['dt_add'] = dt_add
        item['founded'] = founded
        yield item
def parse(self, response):
    # Find all the table rows
    rows = response.xpath(
        '//*[@id="mw-content-text"]/div/table/tbody/tr')[1:]
    for row in rows:
        # The movie title markup varies, so join all text nodes in the first cell.
        film = ''.join(row.xpath('./td[1]//text()').extract())
        # film = row.xpath('./td[1]//text()').extract_first()
        # Relative xpath for all the other columns
        year = int(row.xpath('./td[2]/a/text()').extract_first())
        awards = row.xpath('./td[3]/text()').extract_first()
        nominations = row.xpath('./td[4]/text()').extract_first().strip()
        is_bestpicture = bool(row.xpath('./@style'))
        # Initialize a new WikiItem instance for each movie.
        item = WikiItem()
        item['film'] = film
        item['year'] = year
        item['awards'] = awards
        item['nominations'] = nominations
        item['is_bestpicture'] = is_bestpicture
        yield item
def parse_wiki(self, response):
    def hana_upload(cursor, data):
        url = data[0].replace("'", "''")
        title = data[1].replace("'", "''")
        text = data[2].replace("'", "''")
        sql = f"insert into \"SYSTEM\".\"WIKI\" (TITLE, TEXT, URL) VALUES ('{title}','{text}','{url}')"
        cursor.execute(sql)

    def text_cleaner(value):
        value = ' '.join(value)
        value = value.replace('\n', '')
        value = unicodedata.normalize("NFKD", value)
        value = re.sub(r' , ', ', ', value)
        value = re.sub(r' \( ', ' (', value)
        value = re.sub(r' \) ', ') ', value)
        value = re.sub(r' \)', ') ', value)
        value = re.sub(r'\[\d.*\]', ' ', value)
        value = re.sub(r' +', ' ', value)
        return value.strip()

    print(f'Found a page: {response.url}')
    item = WikiItem()
    body = BeautifulSoup(response.body, 'html.parser')
    item['url'] = response.url
    item['title'] = body.find("h1", {"id": "firstHeading"}).string
    # get the first paragraph
    strings = []
    try:
        for node in response.xpath('//*[@id="mw-content-text"]/div/p'):
            text = text_cleaner(node.xpath('string()').extract())
            if len(text):
                strings.append(text)
    except Exception as error:
        strings.append(str(error))
    item['text'] = ' '.join(strings)
    data = [item['url'], item['title'], item['text']]
    # don't upload empty or broken data
    if data[0] is not None and data[1] is not None and data[2] is not None:
        hana_upload(cursor, data)
        # print(data[2])
    # load new pages
    base_url = self.base_url
    if response.url.startswith(base_url):
        links = response.xpath("//a/@href").extract()
        regex = re.compile(r'^/wiki/.*')
        selected_links = list(filter(regex.search, links))
        for link in selected_links:
            if ':' not in link:
                # print(link)
                absolute_next_page_url = base_url + link
                yield Request(absolute_next_page_url)
    # This method is a generator (it yields Requests above), so a plain
    # `return item` would silently drop the item; yield it instead.
    yield item
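# The module-level cursor used by hana_upload() is assumed to come from
# SAP's hdbcli client; a minimal sketch (host, port, and credentials are
# placeholders):
from hdbcli import dbapi

conn = dbapi.connect(address='localhost', port=39015,
                     user='SYSTEM', password='***')
cursor = conn.cursor()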
def matchCuisine(self, cuisine, item):
    countBuffer = 0
    cuisine_lower = cuisine.lower()
    for word in map(str.lower, self.cuisineNamesList):
        if word in cuisine_lower:
            return WikiItem(cuisine=word, foodItem=item)
        countBuffer += 1
        # Log once nearly all known cuisine names have been checked without a match
        if countBuffer >= self.cuisineNameCount - 1:
            print('###### Not a predefined cuisine:' + cuisine_lower + ':')
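# cuisineNamesList and cuisineNameCount are assumed to be initialized
# elsewhere in the spider; a minimal sketch with placeholder names:
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.cuisineNamesList = ['Italian', 'Mexican', 'Thai']
    self.cuisineNameCount = len(self.cuisineNamesList)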
def parse_wikipedia_page(self, response): print '->', response.url item = WikiItem() soup = BeautifulSoup(response.body) item['url'] = response.url item['title'] = soup.find('h1', {'id':'firstHeading'}).string item['desc'] = soup.find('div', {'id':'mw-content-text'}).find('p') links= set() for link in soup.find('div', {'id':'mw-content-text'}).find('p').findAll('a', attrs={'href': re.compile('^/wiki/')}): links.add('http://en.wikipedia.org{}'.format(link.get('href'))) arr = [] for l in links: arr.append(l) item['links'] = arr return item
def parse_wiki_page(self, response): #print("Parsed: ", response.url) item = WikiItem() item['url'] = response.url item['title'] = BeautifulSoup( response.xpath('//h1[@id="firstHeading"]').extract_first(), "lxml").text item['info'] = BeautifulSoup( response.xpath( '//div[@id="mw-content-text"]/*/p[1]').extract_first(), "lxml").text item['index'] = self.wiki_url_dict[response.url] return item
def parse(self, response): logger.debug("Parsing: {}".format(response.url)) # ['url', 'text', 'fragment', 'nofollow'] outlinks = [ response.urljoin(link.url) for link in self.link_extractor.extract_links(response)[:100] ] logger.debug("Outlinks count ({}): {}".format(response.url, len(outlinks))) item = WikiItem() item['url'] = response.url item['title'] = response.xpath(self.title_xpath).extract_first() item['snippet'] = response.xpath( self.snippet_xpath).extract_first()[:255] + "..." item['outlinks'] = outlinks yield item for link in outlinks: yield scrapy.Request(link, callback=self.parse)
def save(self, response):
    jsonresponse = json.loads(response.text)
    print(jsonresponse['query']['pages'])
    word = response.meta['word']
    pages = jsonresponse['query']['pages']
    for page in pages:
        item = WikiItem()
        item['page_id'] = page
        item['keyword'] = word
        try:
            item['title'] = pages[page]['title']
        except KeyError:
            print("NO TITLE")
            item['title'] = ''
        try:
            item['content'] = pages[page]['extract']
        except KeyError:
            print("NO CONTENT")
            item['content'] = ''
        yield item
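# For context, the JSON shape consumed above matches a MediaWiki query-API
# call with prop=extracts. A sketch of issuing such requests (self.keywords
# is a hypothetical keyword list; the rest uses the public api.php parameters):
def start_requests(self):
    for word in self.keywords:
        url = ('https://en.wikipedia.org/w/api.php?action=query'
               '&prop=extracts&explaintext&format=json&titles=' + word)
        yield scrapy.Request(url, callback=self.save, meta={'word': word})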
def parse(self, response):
    pagina = response.url.split('/')[-1]
    carpeta = "pags_html"
    if not os.path.exists(carpeta):
        os.makedirs(carpeta)
    # Save the raw HTML of the page to disk
    nombre_archivo = carpeta + '/pag_%s.html' % pagina.replace(':', '_')
    with open(nombre_archivo, 'wb') as f:
        f.write(response.body)
    self.log('File %s saved' % nombre_archivo)
    page_info = WikiItem()
    page_info['url'] = response.url
    page_info['ranking'] = 1
    page_info['palabras'] = ""
    page_info['enlaces'] = response.css(
        'a[href^="http"]').xpath('@href').extract()
    page_info['ruta'] = nombre_archivo
    # Follow up to 30 outgoing links
    for url in page_info['enlaces']:
        if self.cont < 30:
            self.cont = self.cont + 1
            print('Processing page number ' + str(self.cont))
            yield scrapy.Request(url=url, callback=self.parse)
    yield page_info
def parse_wiki(self, response):
    def hana_upload(cursor, data):
        cat = data[0].replace("'", "''")
        title = data[1].replace("'", "''")
        url = data[2].replace("'", "''")
        text = data[3].replace("'", "''")
        sql = f"insert into \"SYSTEM\".\"WIKI\" (CAT, TITLE, URL, TEXT) VALUES ('{cat}','{title}','{url}','{text}')"
        cursor.execute(sql)

    def text_cleaner(value):
        value = ' '.join(value)
        value = value.replace('\n', '')
        value = unicodedata.normalize("NFKD", value)
        value = re.sub(r' , ', ', ', value)
        value = re.sub(r' \( ', ' (', value)
        value = re.sub(r' \) ', ') ', value)
        value = re.sub(r' \)', ') ', value)
        value = re.sub(r'\[\d.*\]', ' ', value)
        value = re.sub(r' +', ' ', value)
        return value.strip()

    print(f'Found a page: {response.url}')
    base_url = self.base_url
    category_url = self.category_url
    article_url = self.article_url
    # if category then crawl more pages
    if response.url.startswith(category_url):
        print(f'Crawl category: {response.url}')
        # all links
        # links = response.xpath("//a/@href").extract()
        # category links
        links = response.xpath(
            "//div[@class='mw-category-generated']//a/@href").extract()
        regex = re.compile(r'^/wiki/.*')
        selected_links = list(filter(regex.search, links))
        for link in selected_links:
            absolute_next_page_url = base_url + link
            # print(absolute_next_page_url)
            yield Request(absolute_next_page_url)
    # elif article then fetch page
    elif response.url.startswith(article_url):
        print(f'Crawl article: {response.url}')
        item = WikiItem()
        body = BeautifulSoup(response.body, 'html.parser')
        item['cat'] = self.category
        item['url'] = response.url
        item['title'] = body.find("h1", {"id": "firstHeading"}).string
        # get the first paragraph
        strings = []
        try:
            for node in response.xpath('//*[@id="mw-content-text"]/div/p'):
                text = text_cleaner(node.xpath('string()').extract())
                if len(text):
                    strings.append(text)
        except Exception as error:
            strings.append(str(error))
        item['text'] = ' '.join(strings)
        data = [item['cat'], item['title'], item['url'], item['text']]
        # don't upload empty or broken data
        if None not in data:
            global cursor
            hana_upload(cursor, data)
            print(f' -> Upload: {data[0]} > {data[1]} > {data[2]}')
        # return for scrapy
        yield item
    # else don't do anything
    else:
        print(f'Page is useless: {response.url}')
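# A safer variant of hana_upload using parameterized statements (hdbcli's
# dbapi cursors accept qmark-style placeholders) instead of manual
# quote-escaping; a sketch, not the author's original code:
def hana_upload(cursor, data):
    sql = 'insert into "SYSTEM"."WIKI" (CAT, TITLE, URL, TEXT) VALUES (?, ?, ?, ?)'
    cursor.execute(sql, tuple(data))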
def createItem(self, response):
    item = WikiItem()
    soup = BeautifulSoup(response.extract(), 'html.parser')
    item['title'] = soup.h1.text
    item['content'] = soup.get_text()
    return item
def parse(self, response):
    item = WikiItem()
    title = response.xpath(
        '//h1[@id="firstHeading"]/text()').extract_first()
    item['title'] = title
    item['url'] = response.url
    # tr_list = response.xpath('//table[@class="infobox vcard"]/tr')
    tr_list = response.css('.infobox tr')
    image = tr_list.xpath('//a[@class="image"]/img/@src').extract_first()
    if image is not None:
        item['image'] = "https:" + image
    r_part = re.compile(r'\[\d.\]|\[\d\]')
    # The infobox table on the right-hand side of the page
    info_box = []
    for tr in tr_list:
        th = tr.xpath('./th[@scope="row"]//text()').extract_first()
        if th is not None:
            td = re.sub(r_part, "",
                        "".join(tr.xpath('./td//text()').extract()))
            info_box.append({'key': th, 'value': stripTagSimple(td)})
    print(info_box)
    # print(title)
    pic = []
    thumb_tright = response.xpath(
        '//div[@class="thumb tright"]/div[@class="thumbinner"]')
    for p in thumb_tright:
        if p.xpath('./a/img/@src').extract_first() is not None:
            img = 'https:' + p.xpath('./a/img/@src').extract_first()
            img_desc = re.sub(
                r_part, "", "".join(
                    p.xpath(
                        './div[@class="thumbcaption"]//text()').extract()))
            pic.append({'url': img, 'img_desc': stripTagSimple(img_desc)})
    # print(pic)
    item['pic'] = pic
    html_content = response.xpath(
        '//div[@id="mw-content-text"]').extract_first()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove the table-of-contents node
    catalog = soup.find('div', class_="toc")
    if catalog is not None:
        catalog.decompose()
    # Remove the references node
    ref = soup.find('ol', class_="references")
    if ref is not None:
        ref.decompose()
    # ps holds all top-level paragraphs of the article body
    div = soup.find(name='div', class_='mw-parser-output')
    ps = div.find_all('p', recursive=False)  # only direct children
    index = 0
    for p in ps:
        if p.get_text() == '':
            break
        index += 1
    summary = {}
    s_index = 0
    while s_index < index:
        summary[f'{s_index}'] = stripTagSimple(ps[s_index].get_text())
        s_index += 1
    print(summary)
    start = re.compile(r'<p>', re.DOTALL)
    search_result = start.search(soup.decode('utf-8'))
    if search_result is None:
        search_result = re.compile(r'<h2>',
                                   re.DOTALL).search(soup.decode('utf-8'))
    content_text = collections.OrderedDict()
    if search_result is not None:
        start_node = soup.decode('utf-8')[search_result.start():]
        lists = start_node.split('<h2>')
        i = 1
        while i < len(lists):
            lists[i] = '<h2>' + lists[i]
            final_soup = BeautifulSoup(lists[i], 'html.parser')
            para_title = final_soup.find(
                'span', class_="mw-headline").get_text().strip()
            # Skip "外部链接" (external links) and any "参考" (references) sections
            if para_title == "外部链接" or "参考" in para_title:
                i += 1
                continue
            para_contents = final_soup.find_all(['p', 'li', 'table'])
            texts = []
            for para in para_contents:
                if para.name == 'table':
                    texts.append(para.prettify())
                    continue
                texts.append(stripTagSimple(para.get_text('', True)))
            # '.' in section-title keys is replaced with '点'
            content_text[para_title.replace('.', '点')] = texts
            i += 1
    catlinks = response.xpath(
        '//div[@class="catlinks"]/div[@id="mw-normal-catlinks"]//li')
    tag = {}
    j = 0
    for link in catlinks:
        href = 'https://zh.wikipedia.org' + link.xpath(
            './a/@href').extract_first()
        cat = link.xpath('./a/text()').extract_first()
        tag[f'{j}'] = cat
        j += 1
    detail = {
        'title': title,
        'summary': summary,
        'infobox': info_box,
        'content': content_text,
        'category': tag,
    }
    item['detail'] = detail
    now_time = datetime.datetime.fromtimestamp(time.time())
    item['updateAt'] = now_time
    return item
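# stripTagSimple() is referenced above but not defined in this snippet.
# A plausible minimal sketch, assuming it strips leftover footnote markers
# and collapses whitespace:
def stripTagSimple(text):
    text = re.sub(r'\[\d+\]', '', text)  # drop footnote markers like [1]
    return re.sub(r'\s+', ' ', text).strip()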
def parse(self, response): # if self.count > 10: # return # else: # self.count += 1 json = response.json() json_pages = json['query']['pages'] # extracts 信息分段发送(冗余) if 'continue' in json: print_debug_info('Error:', 'extracts need continue') # page_id 为json动态属性,需要用for循环获取 for page_id, page_content in json_pages.items(): # page_id 为"-1"则词条不存在(冗余) if page_id != '-1': title = json_pages[page_id]['title'] wiki = wikiapi.Wikipedia('zh', json, page_id) page = wiki.page(title) # 只考虑 namespace 为 Main/Article的词条,namespace参见"https://en.wikipedia.org/wiki/Wikipedia:Namespace"(冗余) if page.namespace == 0: # 该名字为"别名",需要额外一次请求获取"真名"(冗余) if page_content['extract'] == '': real_title = page.displaytitle print_debug_info('Warning:', 'extracts is empty') yield scrapy.Request(get_url_by_name(real_title)) # 该词条是一个完整的实体,可以被保存(正常情况仅有该代码段会执行) else: print_debug_info('Success:', page.namespace, page_id, title, response) linked_items = [] # 将被该词条链接的词条加入爬取队列,此处需要至少一个请求 query = link_query(page_id) link_dict = query['link_dict'] redirect_dict = query['redirect_dict'] for link_title in link_dict: link_page_id = link_dict[link_title] linked_items.append({ 'page_id': link_page_id, 'title': link_title }) yield scrapy.Request( get_url_by_page_id(link_page_id)) sections = { 'title': 'summary', 'text': page.summary, 'linked_words': make_linked_words(page.summary, link_dict, redirect_dict), 'sections': make_sections(page.sections, link_dict, redirect_dict) } yield WikiItem(page_id=int(page_id), title=title, sections=sections, linked_items=linked_items) # namespace 不为0(冗余) else: print_debug_info('Warning:', 'ns is not 0') # page_id 为 -1,即该词条不存在,且无重定向链接(冗余) else: print_debug_info('Warning', 'page_id is -1')