def parse(self, response):
    dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
    filename = os.sep.join([dirname, response.url.split('/')[-1]])
    article_text = Selector(response).xpath('//div[@class="post"]').extract()[0]
    parser = Selector(text=article_text)
    article_title = parser.xpath('//a[@id="cb_post_title_url"]/text()').extract()[0]
    title_link = parser.xpath('//a[@id="cb_post_title_url"]/@href').extract()[0]
    article_text = article_text.replace(title_link, title_link[6:])
    item = ArticleItem()
    item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
    item['image_names'] = [x.split('/')[-1] for x in item['image_urls']]
    # process image links.
    for url in item['image_urls']:
        article_text = article_text.replace(url, url[6:])
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'wb') as fp:
        fp.write(self.html_start_l + article_title.encode('utf-8') +
                 self.html_start_r + article_text.encode('utf-8', 'ignore') +
                 self.html_end)
    return item
def parse(self, response):
    print('################################')
    # Notice type: '招标公告' (tender announcement) by default,
    # '中标公告' (award announcement) for the W004_001 listing.
    notice_type = '招标公告'
    if response.url.split('=', 3)[1] == 'W004_001&page':
        notice_type = '中标公告'
    pager_xpath = ("/html/body/div[@class='cover']/div[@class='main']"
                   "/div[@class='main-advert']/div[@class='main-cont']"
                   "/div[@class='list_right']/div[@class='pager']/span/font/text()")
    next_page = response.xpath(pager_xpath).extract()[0]
    print('################################page===' + next_page +
          '===Page################################')
    next_page = str(int(next_page) + 1)
    total_page = response.xpath(pager_xpath).extract()[1]
    next_url = ('http://www.tjgpc.gov.cn/webInfo/getWebInfoListForwebInfoClass.do'
                '?fkWebInfoclassId=W004_001&page=' + next_page + '&pagesize=10')
    rows_xpath = ("/html/body/div[@class='cover']/div[@class='main']"
                  "/div[@class='main-advert']/div[@class='main-cont']"
                  "/div[@class='list_right']/div[@class='cur']/table/tr")
    for tr in response.xpath(rows_xpath).extract():
        title = Selector(text=tr).xpath('//td[2]/a[@class]/text()').extract()[0]
        category = Selector(text=tr).xpath('//td[2]/a[1]/text()').extract()[0]
        category = category.replace('[', '').replace(']', '')
        url = Selector(text=tr).xpath('//td[2]/a[2]/@href').extract()[0]
        title = title.replace('成交结果公告', '')  # drop the "transaction result notice" suffix
        # Strip a leading '天津市' (Tianjin) prefix from the title.
        if len(title) > 3 and title.startswith('天津市'):
            title = title[3:]
        issue_at = Selector(text=tr).xpath('//td[3]/text()').extract()[0]
        issue_at = issue_at.replace('[', '').replace(']', '')
        yield scrapy.Request(url, callback=self.parse_item,
                             meta={"title": title, "type": notice_type, "url": url,
                                   "issue_at": issue_at, "category": category})
    if int(next_page) < int(total_page):
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    # get all the listing blocks
    listings = response.xpath('//a[@class="col-xs-12 profitem"]').getall()
    # within each listing block get the details
    for i in listings:
        # there is more than 1 heading or suburb, just get the first one
        suburb = Selector(text=i).xpath(
            '//h4[@class="mat-header"]/text()').get().strip()
        # new or updated listing
        status = Selector(text=i).xpath(
            '//span[@class="mat-text-span text-uppercase mat-new hidden-xs"]/text()').get()
        # price
        price = Selector(text=i).xpath('//h4[@class="mat-header mat-price"]').get()
        # some regex to extract the price
        loc = re.search("</sup>", price)
        price = price[loc.span()[1]:]
        price = price.replace('<sup>', '').replace('</sup>', '').replace('</h4>', '')
        price = re.sub('\xa0', ' ', price)
        price = price.strip()
        # get all feature details in a list
        details = Selector(text=i).xpath(
            '//ul[@class="mat-feture"]/li/div[@class="mat-fetaure-avl"]/text()').getall()
        # listing details
        home_type = details[0].strip()
        available = details[1].strip()
        occupants = details[2].strip()
        # get description
        desc = Selector(text=i).xpath(
            '//div[@class="col-sm-4 col-md-6 hidden-xs hidden-sm mathes-list"]/p/text()'
        ).get().strip()
        desc = desc.replace('\r', '').replace('\n', '')
        listing = {
            'suburb': suburb,
            'status': status,
            'price': price,
            'home_type': home_type,
            'available': available,
            'occupants': occupants,
            'description': desc,
        }
        yield listing
def parse(self, response):
    dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
    filename = os.sep.join([dirname, response.url.split('/')[-1] + '.html'])
    # parse article text.
    article_text = Selector(response).xpath('//div[@id="article_details"]').extract()[0]
    parser = Selector(text=article_text)
    # parse article title.
    article_title = parser.xpath('//span[@class="link_title"]/a/text()').extract()[0]
    article_links = parser.xpath(
        r'//a[re:test(@href, "[^/]+/article/details/\d+")]/@href').extract()
    # replace links.
    article_text = article_text.replace(
        'http://static.blog.csdn.net/css/blog_detail.css',
        '/static.blog.csdn.net/css/blog_detail.css')
    for link in article_links:
        article_text = article_text.replace(link, '/blog.csdn.net' + link + '.html')
    item = ArticleItem()
    item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
    # handle such image (with watermark) url:
    # http://img.blog.csdn.net/20140917165912117?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvaWFpdGk=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast
    item['image_names'] = [
        (lambda k: k if '?' not in k else k.split('?')[0] + '.png')(x).split('/')[-1]
        for x in item['image_urls']
    ]
    # process image links.
    for url in item['image_urls']:
        article_text = article_text.replace(
            url,
            (lambda k: k if '?' not in k else k.split('?')[0] + '.png')(url)[6:])
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'wb') as fp:
        fp.write(self.html_start_l + article_title.encode('utf-8') +
                 self.html_start_r + article_text.encode('utf-8', 'ignore') +
                 self.html_end)
    return item
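# The watermark-stripping lambda above appears twice in the same function.
# A minimal refactor sketch; normalize_image_url is a hypothetical helper
# name, not part of the original spider:

def normalize_image_url(url):
    """Drop the ?watermark/... query part and give the file a .png name."""
    return url if '?' not in url else url.split('?')[0] + '.png'

# Usage sketch:
#   item['image_names'] = [normalize_image_url(u).split('/')[-1]
#                          for u in item['image_urls']]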
def parse(self, response):
    item = Aiuw_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//span[@id='imgExplain']/text()").extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        return
    tags = Selector(response).xpath("//div[@class='tag']/a/text()").extract()
    if len(tags) > 0:
        item['tag'] = ','.join(tags)
    else:
        item['tag'] = ''
    url = Selector(response).xpath(
        "//div[@class='img_boxlist up userSelectNone']/img/@src").extract()[0]
    item['origin_url'] = url.replace("zip@q80", "zip@w400")
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse(self, response):
    # Derive a local file name from the URL (index pages use the directory name).
    file_name = (self.PATH + response.url.split('/')[-1]
                 if 'index' not in response.url
                 else self.PATH + response.url.split('/')[-2] + '.shtml')
    with open(file_name, 'wb') as writer:
        writer.write(response.body)
    html = response.xpath('//div[@id="contentELMT1368521805488378"]').get()
    if html is None:
        lis = response.xpath('//li/a').getall()
    else:
        lis = Selector(text=html).xpath('//li/a').getall()
    # The first link goes to the summary callback; the rest to the detail callback.
    for li in [lis[0]]:
        try:
            url = Selector(text=li).xpath('//a[@href]').attrib['href']
            url = 'http://www.cctv.com' + url if url[0] == '/' else url
            url = url.replace('news.cntv.cn', 'tv.cctv.com')
            yield SplashRequest(url, callback=self.parse_sumary,
                                endpoint='render.html',
                                args={'wait': 2, 'http_method': 'GET'},
                                headers=self.headers)
        except Exception:
            continue
    for li in lis[1:]:
        try:
            url = Selector(text=li).xpath('//a[@href]').attrib['href']
            url = 'http://www.cctv.com' + url if url[0] == '/' else url
            url = url.replace('news.cntv.cn', 'tv.cctv.com')
            yield SplashRequest(url, callback=self.parse_detail,
                                endpoint='render.html',
                                args={'wait': 2, 'http_method': 'GET'},
                                headers=self.headers)
        except Exception:
            continue
def pmid_to_citation(pmid):
    '''Use pmid to find citation string'''
    url = 'https://www.ncbi.nlm.nih.gov/sites/PubmedCitation?id=' + pmid
    body = requests.get(url, timeout=5).text
    citation = Selector(text=body).xpath('string(/)').get()
    return citation.replace(u'\xa0', u' ')
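# Minimal usage sketch for pmid_to_citation above. The PMID value is
# illustrative only, and the call performs a live HTTP request against NCBI.
if __name__ == '__main__':
    example_pmid = '31452104'  # hypothetical PMID for demonstration
    print(pmid_to_citation(example_pmid))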
def video_parse(self, response):
    item = VideoItem()
    user_id = response.meta['user_id']
    text = json.loads(response.text)
    html = text['data'].get("html", "")
    video_list = Selector(text=str(html)).xpath("//a/figure").extract()
    for v in video_list:
        data_date = Selector(text=v).xpath("//@data-date").extract()[0]
        data_url = Selector(text=v).xpath("//@data-url").extract()[0]
        data_title = Selector(text=v).xpath("//@data-title").extract()[0]
        # The first nums span is the watch count, the second the barrage count.
        nums = Selector(text=v).xpath(
            "//p[@class='crumbs']//span[@class='nums']/text()").extract()
        watch_number = nums[0]
        barrage_number = nums[1]
        item['user_id'] = int(user_id)
        item['title'] = data_title
        # Strip the decimal point and the '万' (x10,000) suffix before casting.
        item['watch_number'] = int(watch_number.replace(".", "").replace("万", ""))
        item['barrage_number'] = int(barrage_number.replace(".", "").replace("万", ""))
        item['time'] = datetime.datetime.strptime(data_date, "%Y/%m/%d")
        item['data_url'] = data_url
        yield item
def parse_list(self, response):
    print('################################Page===' +
          re.search(r"\d+", response.url).group(0) +
          '===Page################################')
    rows_xpath = ("/html/body/div[@class='con_row']/div[@class='list_right f_l']"
                  "/div[@class='search_list_con gg_list']/ul/li")
    for li in response.xpath(rows_xpath).extract():
        title = Selector(text=li).xpath('//a/text()').extract()[0]
        issue_at = Selector(text=li).xpath(
            "//span[@class='search_list_time']/text()").extract()[0]
        url = 'http://www.hebeieb.com' + Selector(text=li).xpath('//a/@href').extract()[0]
        category = Selector(text=li).xpath(
            "//div[@class='search_list_biaoqian']/span[1]/text()").extract()[0]
        category = category.replace('行业:', '')  # strip the "industry:" label
        city = Selector(text=li).xpath(
            "//div[@class='search_list_biaoqian']/span[2]/text()").extract()[0]
        city = city.replace('地区:', '')  # strip the "region:" label
        notice_type = '招标公告'  # tender announcement
        yield scrapy.Request(url, callback=self.parse_item,
                             meta={"title": title, "type": notice_type, "url": url,
                                   "issue_at": issue_at, "city": city,
                                   "category": category})
def article_content(self, response):
    item = response.meta['item']
    title = Selector(text=response.body).xpath(
        '//h2[@id="activity-name"]/text()').extract()[0].strip()
    content = Selector(text=response.body).xpath(
        '//*[@id="js_article"]/div[@class="rich_media_inner"]').extract()[0]
    content = content.replace('\r\n', '').strip()
    item['title'] = title
    item['content'] = content
    return item  # return the item; the pipeline performs the database write
def parse(self, response):
    rows_xpath = ("/html/body/form[@id='jyform']/div[@class='clearfix']"
                  "/div[@id='jytypetext']/div[@class='clearfix isshowdisplay']"
                  "/div[@class='l']/div[@class='infor-bd clearfix']"
                  "/ul[@class='infor-items']/div[@id='jyform:refreshData']"
                  "/div[@id='jyform:refreshData_content']"
                  "/table[@class='ui-datagrid-data']/tbody"
                  "//tr[@class='ui-datagrid-row']/td[@class='ui-datagrid-column']"
                  "/li[@class='notice-item infor-item clearfix']")
    for li in response.xpath(rows_xpath).extract():
        title = Selector(text=li).xpath(
            "//div[@class='notice-block l']/a/text()").extract()[0]
        # Strip the \n and \t padding around the title.
        title = title.replace('\n', '').replace('\t', '')
        notice_type = '招标公告'  # tender announcement
        city = Selector(text=li).xpath(
            "//span[@class='infro-span'][1]/text()").extract()[0]
        city = city.replace('【', '').replace('】', '')
        issue_at = Selector(text=li).xpath(
            "//span[@class='notice-date ']/text()").extract()[0]
        url = Selector(text=li).xpath(
            "//div[@class='notice-block l']/a/@href").extract()[0]
        url = 'http://www.zjpubservice.com' + url
        print(url)
def parse(self, response):
    xxs = Selector(response)
    pois = xxs.xpath('//poi').extract()
    for poi in pois:
        state = Selector(text=poi).xpath('//state/text()').get()
        if state is None:
            state = Selector(text=poi).xpath('//province/text()').get()
        addr = Selector(text=poi).xpath('//address1/text()').get()
        if addr is None:
            addr = Selector(text=poi).xpath('//address2/text()').get()
        if addr is None:
            addr = Selector(text=poi).xpath('//dsply_adr/text()').get()
        name = Selector(text=poi).xpath('//name/text()').get()
        name = (name.replace('<br>', '')
                    .replace('®', ' ')
                    .replace(';', '')
                    .replace('  ', ' '))  # collapse doubled spaces left by the removals
        properties = {
            'ref': Selector(text=poi).xpath('//clientkey/text()').get(),
            'name': name,
            'addr_full': addr,
            'city': Selector(text=poi).xpath('//city/text()').get(),
            'state': state,
            'postcode': Selector(text=poi).xpath('//postalcode/text()').get(),
            'country': Selector(text=poi).xpath('//country/text()').get(),
            'lat': Selector(text=poi).xpath('//latitude/text()').get(),
            'lon': Selector(text=poi).xpath('//longitude/text()').get(),
            'phone': Selector(text=poi).xpath('//phone/text()').get(),
            'extras': {'brand': "Timberland"},
        }
        yield GeojsonPointItem(**properties)
def Vacancy_info(url):
    # Switch to the Georgian version of the page.
    url = url.replace("/en/", "/ge/")
    print(url)
    page = requests.get(url)
    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="job"]/table/tr[1]/td/table[2]').get()
        description = remove_tags(description)
        description = description.strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""
    if detect(description) == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif detect(description) == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""
    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)
        email = email[0]
    except:
        email = ""
    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email,
    }
    return data

# Vacancy_info("https://jobs.ge/en/?view=jobs&id=268715")
def get_reviews(self, item):
    url = Selector(text=item).xpath('.//a/@href').extract()
    if len(url):
        url = url[0]
        url = url.replace('../../', 'http://www.comparometer.in/')
        # res = requests.get(url)
        # data = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/img/@src').extract()
        # data2 = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/span/text()').extract()
        # data3 = Selector(text=res.text).xpath('//div[@class="col-sm-12"]/div[@class="col s4 reviewrating"]/a/@href').extract()
        # addup = list(zip(data, data2, data3))
        # review = " ".join(str(x) for x in addup)
    else:
        url = "NA"  # no review link found
    return url
def parse_name_and_birth(page):
    # Profile markup looks like:
    # <h4>강기윤</h4>
    # <ul>
    #   <li class="photo">
    #     <img src="/photo/9770703.jpg" alt="강기윤 의원사진" />
    #   </li>
    #   <li>姜起潤</li>
    #   <li>KANG Gi Yun</li>
    #   <li>1960-06-04</li>
    # </ul>
    profile = get_xpath_data(page, ".//*/div[@class='profile']")
    name_kr = get_xpath_data(profile, ".//*/h4/text()")
    name_cn = Selector(text=profile).xpath('.//*/li/text()')[2].extract()
    name_en = Selector(text=profile).xpath('.//*/li/text()')[3].extract()
    birth = Selector(text=profile).xpath('.//*/li/text()')[4].extract()
    return [name_kr, name_cn, name_en, birth.replace('.', '-')]
def parse(self, response):
    rows = response.xpath('//div[@class="m_content"]/ul/li[not(@class)]').extract()
    for isi in rows:
        link_page = Selector(text=isi).xpath(
            '//div[@class="desc_nhl"]/a/@href').extract_first()
        clean_date = Selector(text=isi).xpath(
            '//div[@class="desc_nhl"]/span[@class="labdate f11"]/text()').extract_first()
        clean_date = clean_date.replace("DETIKNEWS | ", "")
        item = {
            'headline': Selector(text=isi).xpath(
                '//article/div[@class="desc_nhl"]/a[@data-category="WP Kanal Jawatimur"]/h2/text()'
            ).extract_first(),
            'main_headline': Selector(text=isi).xpath(
                '//div[@class="desc_nhl"]/text()[4]').extract_first().strip(),
            'date': clean_date,
            'url': link_page,
        }
        request = scrapy.Request(link_page, callback=self.parse_page2)
        request.meta['item'] = item
        yield request
def parseWaterBill(self, response):
    # Check if we found the water bill; if not, write to the failed CSV and return.
    if len(response.xpath(
            "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblCurrentBalance']"
            )) == 0:
        print("Couldn't find a water bill for account " +
              response.meta['account_or_address'])
        self.writeFailedCSV(response.meta['account_or_address'])
        return None
    # Scrapy items are used to store the scraped fields.
    wateritem = WaterbillItem()
    # This is a relic of when searches were done by address.
    wateritem['Searched_Address'] = response.meta['search_type']
    table = response.xpath('//table[@class="dataTable"]//tr')
    headers = [
        'Account Number', 'Service Address', 'Current Read Date',
        'Current Bill Date', 'Penalty Date', 'Current Bill Amount',
        'Previous Balance', 'Current Balance', 'Previous Read Date',
        'Last Pay Date', 'Last Pay Amount', 'TimeStamp'
    ]
    # Unverified: no test account with a shut-off notice was available.
    if len(response.xpath(
            "//span[@id='ctl00_ctl00_rootMasterContent_LocalContentPlaceHolder_lblTurnOffDate']"
            )) != 0:
        wateritem['TurnOffDate'] = "Yes"
    else:
        wateritem['TurnOffDate'] = 'No'
    for row in table:
        header = Selector(text=row.extract()).xpath('//th/text()').extract_first()
        value = Selector(text=row.extract()).xpath(
            '//td/descendant::*/text()').extract_first()
        if value is None:
            value = ''  # so it populates the sheet with a blank cell
        if header is not None and header.strip().replace(':', "") in headers:
            value = value.replace('$', '').replace(",", '')
            if "Date" in header and value != '':
                # Convert to SQL datetime format.
                value = datetime.strptime(value.strip(),
                                          '%m/%d/%Y').strftime('%Y-%m-%d')
            wateritem[header.strip().replace(':', "").replace(' ', '_')] = value.strip()
    wateritem['Timestamp'] = datetime.today().strftime('%Y-%m-%d')
    return wateritem
def parse_item(self, response):
    page = response.url.split("/")[-2]
    list_it = response.xpath('//li[contains(@class,"review-item")]').extract()
    list_item = []
    for it in list_it:
        content = Selector(text=it).xpath(
            '//div[contains(@class,"review-des")]/div[contains(@class,"rd-des")]/span/text()'
        ).extract_first()
        if content is not None:
            content = content.replace("\n", " ")
        point = Selector(text=it).xpath(
            '//div[contains(@class,"review-des")]//div[contains(@class,"review-points")]/span/text()'
        ).extract_first()
        if point is not None and content is not None:
            list_item.append(point + "\t" + content)
        yield scrapy.Request(it, self.parse_detail_item)
    for a in list_item:
        print("###############################")
        print(a)
    with open("data2/{}.txt".format(page), "w", encoding='utf-8') as file:
        for i in list_item:
            file.write(i + "\n")
def parse(self, response):
    # Notice type: '招标公告' (tender announcement) by default,
    # '中标公告' (award announcement) when nav=67.
    notice_type = '招标公告'
    if response.url.split('=', 1)[1] == '67':
        notice_type = '中标公告'
    category = response.xpath(
        "/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[1]"
        "/td[3]/span[@class='zt1']/text()").extract()[0]
    category = category.split('--', 3)[3]
    currentPage = response.xpath(
        "/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]"
        "/td/div[@class='zt3']/div[@class='pager']/strong/font/text()").extract()[0]
    nextPage = int(currentPage) + 1
    print('Nav' + response.url.split('=', 1)[1] +
          '###############################Page===' + currentPage +
          '===Page################################')
    totalPage = response.xpath(
        "/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]"
        "/td/div[@class='zt3']/div[@class='pager']/a[text()='最后一页 »']/@href"
    ).extract()[0]
    totalPage = totalPage.split('=', 2)[2]
    for tr in response.xpath(
            "/html/body/table[1]/tr/td[2]/table/tr[1]/td[@class='c_pt']/table/tr[2]"
            "/td/div[@class='zt3']/table[@id='node_list']/tbody/tr").extract():
        title = Selector(text=tr).xpath('//td[1]/a/text()').extract()[0]
        url = 'http://www.sxzfcg.cn/' + Selector(text=tr).xpath('//td[1]/a/@href').extract()[0]
        issue_at = Selector(text=tr).xpath('//td[2]/text()').extract()[0]
        issue_at = issue_at.replace('[', '').replace(']', '')
        city = '山西'  # Shanxi
        yield scrapy.Request(url, callback=self.parse_item,
                             meta={"title": title, "type": notice_type, "url": url,
                                   "issue_at": issue_at, "city": city,
                                   "category": category},
                             dont_filter=True)
    # Follow the next listing page until the last page is reached.
    next_url = (response.url.split('=', 2)[0] + '=' +
                response.url.split('=', 2)[1] + '&page=' + str(nextPage))
    if nextPage <= int(totalPage):
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    docID = response.url[response.url.find("=") + 1:]
    docURL = '/content/content?DocID=' + docID
    if self.collection.find_one({"链接": response.url}) is not None:
        print("<<<<<<<<<<<Catch Duplicate<<<<<<<<<")
        return
    print("----------------------------- Cur -----------------------")
    self.cur += 1
    print(response.url)
    print(self.cur)
    item = Judge()
    item['case_name'] = ''.join(Selector(response).xpath(
        "//input[@id='hidCaseName']/@value").extract())
    item['case_num'] = ''.join(Selector(response).xpath(
        "//input[@id='hidCaseNumber']/@value").extract())
    item['url'] = response.url
    dic_string_unmod = Selector(response).xpath(
        "//input[@id='hidCaseInfo']/@value").extract()[0]
    dic_string = ''.join(dic_string_unmod.replace('null', 'None').split())
    case_info = eval(dic_string)  # the hidden field holds a Python-evaluable dict
    item['case_info'] = case_info['诉讼记录段原文']  # original litigation-record text
    item['procedure'] = case_info['审判程序']  # trial procedure
    item['court'] = ''.join(Selector(response).xpath(
        "//input[@id='hidCourt']/@value").extract())
    item['company'] = self.advanced_filter[0]
    docID = Selector(response).xpath("//input[@id='hidDocID']/@value").extract()
    doc_text_url = ("http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID="
                    + ''.join(docID))
    doc_url = "http://wenshu.court.gov.cn/content/content?DocID=" + ''.join(docID)
    req = scrapy.Request(doc_text_url, callback=self.parse_doc, dont_filter=True,
                         errback=lambda x: self.download_errback(x, doc_url))
    req.meta['foo'] = item
    yield req
def parse(self, response):
    rules = Rules().parse
    block = response.xpath(rules['block']).extract()
    for b in block:
        item = LotteryItem()
        red_number = Selector(text=b).xpath(rules['red_number']).extract()
        blue_number = Selector(text=b).xpath(rules['blue_number']).extract()
        phase_number = Selector(text=b).xpath(rules['phase_number']).extract_first()
        note_number = Selector(text=b).xpath(rules['note_number']).extract_first()
        bonus = Selector(text=b).xpath(rules['bonus']).extract_first()
        item['red_number'] = ','.join(map(str, map(int, red_number)))
        item['blue_number'] = ','.join(map(str, map(int, blue_number)))
        item['phase_number'] = int(phase_number)
        item['note_number'] = int(note_number)
        item['bonus'] = int(float(bonus.replace(',', '')))
        yield item
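# Rules().parse is not shown in this listing; from its use above it is
# presumably a dict of XPath strings keyed by field name. A hypothetical
# shape, for illustration only (the selectors below are assumptions):
EXAMPLE_RULES = {
    'block': '//div[@class="ball_box01"]',          # one node per draw
    'red_number': '//li[@class="ball_red"]/text()',
    'blue_number': '//li[@class="ball_blue"]/text()',
    'phase_number': '//span[@class="phase"]/text()',
    'note_number': '//span[@class="notes"]/text()',
    'bonus': '//span[@class="bonus"]/text()',
}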
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # , headers=headers)
    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]'
        ).get()
        published = published.strip().split(" ")
        publish_day = int(published[0].split("/")[0])
        publish_month = int(published[0].split("/")[1])
        publish_year = int(published[0].split("/")[2])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return
    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()'
        ).get()
        location = location.strip()
        location_id = [{"city": f"{location}", "id": f"{Geonames(location)}"}]
    except:
        location_id = [{'city': 'Yerevan', 'id': '616052'}]
    # Posted by
    try:
        posted_by = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()'
        ).get()
        posted_by = posted_by.strip()
    except:
        posted_by = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()'
        ).get()
        email = email.strip()
        email = [] if email == "" else [email]
    except:
        email = []
    # Workspace
    try:
        workspace = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()'
        ).get()
        workspace = workspace.strip()
    except:
        workspace = ""
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()'
        ).get()
        salary = salary.strip().replace("Until ", "")
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip())
            max_salary = int(salary[1].strip())
        elif salary != '':
            min_salary = int(salary)
            max_salary = int(salary)
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0
    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()'
        ).get()
        education = education.strip()
    except:
        education = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()'
        ).get()
        experience = experience.strip()
    except:
        experience = ""
    # Gender (encoded in an icon class; check "female" first since it contains "male")
    try:
        gender = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class'
        ).get()
        if "female" in gender:
            gender = "female"
        elif "male" in gender:
            gender = "male"
        else:
            gender = ''
    except:
        gender = ""
    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()'
        ).get()
        age = age.strip()
    except:
        age = ""
    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()'
        ).get()
        description = description.strip()
    except:
        description = ""
    description_en = ""
    description_am = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        description_en = ""
        description_am = ""
    # Phone
    try:
        phone_blocks = Selector(response=page).css(
            '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details'
        ).extract()
        phones = []
        for phone in phone_blocks:
            phone = remove_tags(phone).strip()
            area_code = "374"  # default Armenian country code
            number = (phone.replace(" ", "").replace("-", "")
                      .replace("(", "").replace(")", ""))
            phones.append({'country_code': area_code, "number": number})
    except:
        phones = []  # the data dict below expects `phones`
    # Username
    try:
        username = Selector(response=page).xpath(
            '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()'
        ).get()
        username = username.strip()
    except:
        username = ""
    data = {
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "location_id": location_id,
        "posted_by": posted_by,
        "email": email,
        "workspace": workspace,
        "job_type": job_type,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "education": education,
        "experience": experience,
        "gender": gender,
        "age": age,
        "description_am": description_am,
        "description_en": description_en,
        "phone": phones,
        "username": username
    }
    print(data)
    return data

# Vacancy("https://full.am/en/job/public/view/1163")
# https://full.am/en/job/public/view/12067
# https://full.am/en/job/public/view/1163
def parse(self, response):
    detailed_review_object_list = []
    review_selector_list = response.xpath(
        '//div[@id="reviews-container"]//div[@class="js-paginator-data"]'
    ).xpath('//div[@class="rvw js-rvw"]')
    for _review_selector in review_selector_list:
        _current_review_selector_body = _review_selector.get()
        # e.g. '5.0'
        _review_rating = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw__hdr-stat"]//img/@data-rating').get()
        # e.g. 'Julie of Ceres,, CA'
        _author_info = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-aut__inf"]/strong/text()').get()
        _author_state: str = _author_info.split(',')[-1]  # 'CA'
        # e.g. 'Original review: March 18, 2019'
        _review_date_text = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/span/text()').get().split(':')[-1]
        # Remove whitespace to make the datetime conversion easier.
        _review_date_text = _review_date_text.replace(' ', '')  # 'March18,2019'
        _review_date_text = _review_date_text[-4:]
        # _date_pattern = '%b.%d,%Y'  # 'Oct.21,2019'
        _date_pattern = '%Y'  # only the year is kept, e.g. '2019'
        _struct_time_format = time.strptime(_review_date_text, _date_pattern)
        _date_time_format = datetime.datetime(*_struct_time_format[:6])
        eastern = pytz.timezone('US/Eastern')
        utc = pytz.utc
        aware_date_time = eastern.localize(_date_time_format)
        utc_review_date_time = aware_date_time.astimezone(utc).timestamp()
        # This is the list of all paragraphs found in the review that we process.
        _review_description_paragraph_list: list = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/p').getall()
        _clean_review_description_list: list = []
        # Check whether there is a collapsed div that also needs processing.
        if Selector(text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]'
                ).get() is not None:
            # Collect all the paragraphs in the collapsed div ...
            _collapsed_paragraph_list = Selector(
                text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]/p'
            ).getall()
            # ... and add them to the original list for processing.
            _review_description_paragraph_list.extend(_collapsed_paragraph_list)
        for para in _review_description_paragraph_list:
            # Keep only non-empty paragraphs.
            if Selector(text=para).xpath('//p/text()').get() is not None:
                _clean_review_description_list.append(
                    Selector(text=para).xpath('//p/text()').get())
        _clean_review_description = ''.join(_clean_review_description_list)
        _num_found_useful_text: str = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-foot"]/span[@class="rvw-foot__helpful-count js-helpful-count ca-txt--clr-gray"]/strong/text()'
        ).get()
        # Extract the number from text such as '97 people'.
        _num_found_useful: str = _num_found_useful_text.split(' ')[0]
        detailed_review_object = {
            'ratings': _review_rating,
            'reviewer_location': _author_state,
            'review_time_utc': str(utc_review_date_time),
            'review_description': _clean_review_description,
            'num_found_useful': _num_found_useful,
        }
        detailed_review_object_list.append(detailed_review_object)
    _return_data = {'reviews': detailed_review_object_list}
    return _return_data
    # Company
    try:
        company = Selector(response=page).xpath(
            f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[1]/a/text()').get()
        company = company.strip()
    except Exception:
        company = ""
    # Role
    try:
        role = Selector(response=page).xpath(
            f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[2]/text()').get()
        role = role.strip()
        role = role.replace("\n", "")
        role = re.sub(' +', ' ', role)
    except:
        role = ""
    # Date
    try:
        starting_from = Selector(response=page).xpath(
            f'//*[@id="affiliations-list"]/tbody/tr[{tr}]/td[3]/text()').get()
        starting_from = starting_from.strip()
    except:
        starting_from = ""
    # Documentation
    try:
        publish_day = int(published[0].split(" ")[1])
        publish_month = int(months[published[0].split(" ")[0]])
    except:
        publish_year = 0
        publish_day = 0
        publish_month = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        continue
    # Ends
    try:
        ends = Selector(response=page).xpath(
            f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[2]/p/span[4]/time/span[2]/text()'
        ).get()
        ends = ends.replace("-", "").strip()
        ends = ends.strip().split(",")
        deadline_year = int(ends[1].strip())
        deadline_day = int(ends[0].split(" ")[1])
        deadline_month = int(months[ends[0].split(" ")[0]])
    except:
        deadline_year = 0
        deadline_day = 0
        deadline_month = 0
    # Logo
    try:
        logo = Selector(response=page).xpath(
            f'/html/body/div[3]/div[1]/div/div/div[1]/div/div[2]/div/article[{div}]/div/div[1]/a/img/@src'
        ).get()
    except:
def kinopars(self, name, year, content):
    content = content.decode('cp1251')
    content = self.resub.sub('', content)

    def check(obj):
        # Return the first match or an empty string.
        return obj[0] if obj else ''

    xpath = '//link[@rel="canonical"]/@href'
    id = Selector(text=content).xpath(xpath).extract()
    if id:
        id = self.renum.findall(id[0])[0]
    xpath = '//div[@class="brand_words"][@itemprop="description"]/text()'
    text = check(Selector(text=content).xpath(xpath).extract())
    text = self.req.sub('', text)
    xpath = '//span[@class="rating_ball"]/text()'
    rating = check(Selector(text=content).xpath(xpath).extract())
    if text and rating and id:
        print('was found on kinopoisk.ru/film', id)
        xpath = '//a[@class="popupBigImage"]/img/@src'
        poster = check(Selector(text=content).xpath(xpath).extract())
        if poster == 'http://st.kp.yandex.net/images/movies/poster_none.png':
            poster = u'false'
        else:
            poster = u'true'
        xpath = '//span[@class="ratingCount"]/text()'
        count = Selector(text=content).xpath(xpath).extract()
        if count:
            count = count[0]
            count = count.replace(u'\xa0', u'')
        else:
            count = 0
        print('rating', rating, count, end=' ')
        xpath = '//td[@class="time"]/text()'
        time = Selector(text=content).xpath(xpath).extract()
        nulltime = '0:0'
        if len(time) > 1:
            time = self.retime.findall(time[1])
            time = time[0] if len(time) >= 1 else nulltime
        elif len(time) == 1:
            time = self.renum.findall(time[0])
            if len(time) >= 1:
                # Convert total minutes into an "H:M" string.
                time = int(time[0])
                th = time // 60
                tm = time - (th * 60)
                time = str(th) + ':' + str(tm)
            else:
                time = nulltime
        else:
            time = nulltime
        print('time', time, end=' ')
        xpath = '//div[@id="block_rating"]/div[1]/div[2]/text()'
        imdb = check(Selector(text=content).xpath(xpath).extract())
        imdb = float(self.rescfl.findall(imdb)[0]) if imdb else 0
        print('imdb:', imdb)
        head = '(name,year,text,rating,count,imdb,time,kinopoiskid,poster)'
        values = (name.encode('utf8'), year, text.encode('utf8'),
                  rating, count, imdb, time, id, poster)
        fid = self.db.insert('ruparser_film', head, values)
        return fid
def Vacancy_info(url):
    print(url)
    page = requests.get(url)
    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[4]').get()
        description = remove_tags(description)
        description = description.strip()
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""
    if detect(description) == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif detect(description) == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div/a/@href').get()
        email = email.replace("mailto:", "")
    except:
        email = ""
    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[1]/div[2]/span/text()'
        ).get()
        location_id = []
        try:
            location_id.append({"city": f"{location}", "id": f"{Geonames(location)}"})
        except:
            location_id.append({"city": f"{location}", "id": "611717"})
    except:
        location_id = [{"city": "Tbilisi", "id": "611717"}]
    # Category
    try:
        category = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/span[1]/text()'
        ).get()
    except:
        category = ""
    # Stack
    try:
        stack = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[4]/div[2]/text()'
        ).get()
        if "სრული განაკვეთი" in stack:  # Georgian for "full time"
            stack = "Full-Stack"
    except:
        stack = ""
    data = {
        "description_en": description_en,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "email": email,
        "location": location_id,
        "category": category,
        "stack": stack,
    }
    print("Vacancy Scraped Successfully")
    return data
def parse(self, response):
    description = response.xpath(
        "//table[@class='itemlist']/tr[not(re:test(@class, '(spacer)'))]").extract()
    row = self.get_default_row_dict()
    for i, v in enumerate(description):
        index = i
        if not row['rank']:
            value = Selector(text=v).xpath(
                '//td[1]/span[@class="rank"]/text()').extract_first()
            row['rank'] = int(value.replace('.', '')) if value else 0
        if not row['story_text']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/text()').extract_first()
            row['story_text'] = value if value else ''
        if not row['link_href']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/@href').extract_first()
            row['link_href'] = value if value else ''
        if not row['hn_user']:
            value = Selector(text=v).xpath(
                '//a[@class="hnuser"]/text()').extract_first()
            row['hn_user'] = value if value else ''
        if not row['age']:
            value = Selector(text=v).xpath(
                '//span[@class="age"]/a/text()').extract_first()
            row['age'] = int(value.split(' ')[0]) if value else 0
        if not row['total_comments']:
            value = Selector(text=v).xpath(
                '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
            ).extract_first()
            if value:
                value = value.replace('comments', '').replace('comment', '')
                row['total_comments'] = int(value) if represents_int(value) else 0
        if not row['score']:
            value = Selector(text=v).xpath(
                '//span[@class="score"]/text()').extract_first()
            row['score'] = int(value.split(' ')[0]) if value else 0
        if not row['hn_id_code']:
            value = Selector(text=v).xpath('//tr[@class="athing"]/@id').extract_first()
            row['hn_id_code'] = int(value) if represents_int(value) else 0
        # A story spans two table rows; save once every field has been filled.
        if all(v is not None for v in row.values()):
            print('Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            data = row.copy()
            row = self.get_default_row_dict()
            self.comment_url.append('https://news.ycombinator.com/item?id=15318440')
            news_id = data['hn_id_code']
            item = NewsBotItem(data)
            yield item
            request = scrapy.Request(
                url='https://news.ycombinator.com/item?id=' + str(news_id),
                callback=self.parse_comment)
            request.meta['item'] = item
            request.meta['news_id'] = int(news_id)
            yield request
        if index % 2:
            row = self.get_default_row_dict()
f'//*[@id="MainContentPlaceHolder_jobPageContainer"]/a[{div}]/div/div[2]/div/text()' ).get() location = location.split(",")[0] location_id = [{ "city": f"{location}", "id": f"{Geonames(location)}" }] except: location_id = [{'city': 'Yerevan', 'id': '616052'}] # Publication try: published = Selector(response=page).xpath( f'//*[@id="MainContentPlaceHolder_jobPageContainer"]/a[{div}]/div/div[1]/div[3]/text()' ).get() published = published.replace("Published on ", "").split("/") publish_day = int(published[0]) publish_month = int(published[1]) publish_year = int(published[2]) except: publish_day = 0 publish_month = 0 publish_year = 0 if publish_day != yesterday_day: continue # //*[@id="MainContentPlaceHolder_jobPageContainer"]/a[1]/div/div[1]/div[2] # //*[@id="MainContentPlaceHolder_jobPageContainer"]/a[2]/div/div[1]/div[2] data = { "company": company,
def BiaFunction(company):
    driver.get("https://www.bia.ge/EN")
    driver.find_element_by_xpath('//*[@id="Filter_Query"]').send_keys(f"{company}")
    time.sleep(3)
    try:
        link = driver.find_element_by_xpath(
            '/html/body/div[8]/div[2]').get_attribute('data-url')
        page = requests.get(link)
        # Company name
        name = Selector(response=page).xpath(
            '//*[@id="TrademarksListBox"]/li/text()').get()
        # VAT number
        vat_number = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[2]/td[2]/span[2]/text()').get()
        # Address: "<postal_code>, <city>, <region>, <apartment>"
        try:
            address = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[4]/td[2]/span[2]/text()').get()
            raw = address.split(",")
            postal_code = raw[0]
            location = raw[1].lstrip()
            region = raw[2]
            appartment = raw[3]
            city_id = Geonames(location)
            address = {
                "location": {
                    "country": "GE",
                    "city": {"id": f"{city_id}", "city": location},
                },
                "postal_code": postal_code,
                "appartament": appartment,
                "region": region,
            }
        except Exception as e:
            print(e)
            address = {}
        # Working hours, parsed from a "Monday-Friday: 9:00 - 18:00" style string.
        try:
            working_hours = Selector(response=page).xpath(
                '//*[@id="tpAboutCompany"]/table/tbody/tr[5]/td[2]/ul/li/text()').get()
            raw = working_hours.split(":", 1)
            till = raw[0].split("-")[1].lstrip().lower()
            days = []
            for day in weekdays:
                days.append(day)
                if day == till:
                    break
            hourfrom = raw[1].split("-")[0].strip()
            hourto = raw[1].split("-")[1].strip()
            business_hours = {
                "week_days": days,
                "hour_from": hourfrom,
                "hour_to": hourto,
            }
        except:
            business_hours = {}
        # Foundation date
        foundation_date = Selector(response=page).xpath(
            '//*[@id="tpAboutCompany"]/table/tbody/tr[3]/td[2]/span[2]/text()').get()
        # Phone
        try:
            phone = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[2]/td[2]/span').get()
            phone = remove_tags(phone)
            if "," in phone:
                array = phone.split(",")
                phone = []
                for each in array:
                    each = each.strip().split(" ", 1)
                    code = each[0].replace("+", "")
                    number = each[1].replace(" ", "")
                    phone.append({"country_code": code, "number": number})
            else:
                add = phone.strip().split(" ", 1)
                code = add[0].replace("+", "")
                number = add[1].replace(" ", "")
                phone = [{"country_code": code, "number": number}]
        except:
            phone = []
        # Website
        try:
            web = Selector(response=page).xpath(
                '//*[@id="ContactsBox"]/table/tbody/tr[3]/td[2]/span').get()
            web = remove_tags(web)
            if "," in web:
                web = [each.strip() for each in web.split(",")]
            else:
                web = [web.strip()]
        except:
            web = []
        # Email (addresses are scraped with a regex after removing masked placeholders)
        try:
            email = Selector(response=page).xpath('//*[@id="TabPanelBox"]').get()
            email = email.replace("*****@*****.**", "")
            email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
        except:
            email = []
        info = {
            "name": name,
            "vat": vat_number,
            "addresses": address,
            "business_hours": business_hours,
            "phones": phone,
            "websites": web,
            "emails": email,
            "foundation_date": foundation_date,
        }
        print("Bia Scraped Successfully")
        # print(info)
        return info
    except:
        print("No info")
        return "No info"
            for each in array:
                each = each.strip()
                web.append(each)
        else:
            web = [web.strip()]
    except:
        web = []
    # Email
    try:
        email = Selector(response=page).xpath('//*[@id="TabPanelBox"]').get()
        email = email.replace("*****@*****.**", "")
        email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
    except:
        email = []
    # Logo
    try:
        # The logo URL is embedded in an inline style: url('...').
        logo = Selector(response=page).xpath('//*[@id="LogoImageUploaderBox"]').get()
        logo = logo.split("url('")[1].split("')")[0]
    except:
        logo = ""
    info = {
def parse(self, response):
    productList = Selector(text=response.body).xpath(
        '//li[contains(@class, "gl-item")]').extract()
    # Server-side notes on how image paths are generated (PHP):
    # $object = UPLOAD_PATH.$new_path.md5(time().mt_rand(100, 999999999)).
    #     '.'.pathinfo($file->getInfo('name'), PATHINFO_EXTENSION);
    # $new_path = 'goods'.date('Y').'/'.date('m-d').'/';
    Class = Selector(text=response.body).xpath(
        '//div[contains(@class, "p-name p-name-type-2")]//em[not(i)]').extract()
    print(Class)
    for item in productList:
        if self.num > self.getNum:
            break
        name = Selector(text=item).xpath(
            '//div[contains(@class, "p-name")]/a/em').extract()[0]
        name = filterStr.filter_tags(name)
        skuid = Selector(text=item).xpath('//li/@data-sku').extract()[0]
        price = Selector(text=item).xpath(
            '//div[contains(@class, "p-price")]/strong/i').extract()[0]
        price = filterStr.filter_tags(price)
        imgsrc = Selector(text=item).xpath(
            '//li[contains(@class, "gl-item")]//img/@src').extract()[0]
        imgsrc = imgsrc.replace('//', '')
        # Strip JD marketing boilerplate from product names, e.g.
        # '京东超市金龙鱼 食用油 葵花籽清香型 食用植物调和油5L(新老包装随机发货)'.
        for noise in ("京东超市", "(京东定制)", "(京东定制装)", "京东自营",
                      "(新老包装随机发货)", "新旧包装随机配送", "新老包装随机发放",
                      "(新老包装随机发放,数量有限,赠完为止)", "中粮出品", "(中粮出品)"):
            name = name.replace(noise, "")
        # Skip Walmart-branded listings entirely.
        if "【沃尔玛】" in name:
            continue
        self.item['name'] = name.strip()
        self.item['price'] = price
        self.item['skuid'] = skuid
        # self.item['Class'] = Class
        self.item['imgsrc'] = imgsrc
        self.item['sourceType'] = SOURCE_TYPE_JD
        self.item['goods_id'] = self.insertGoods(self.item)
        self.num = self.num + 1
        yield self.item
def Vacancy(link):
    url = link
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(url, headers=headers)
    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except:
        company = ""
    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except:
        position = ""
    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except:
        logo = ""
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""
    # Contact person
    try:
        person = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get()
        person = person.strip()
    except:
        person = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get()
        email = [email.strip()]
    except:
        email = []
    # Phone: normalize one or more comma-separated numbers into
    # {"country_code": ..., "number": ...} dicts; "374" (Armenia) is the default.
    try:
        phone = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get()
        phone = phone.strip()
        if "," in phone:
            phones = phone.split(",")
            phone = []
            for each in phones:
                each = each.strip()
                if "+" in each and " " in each:
                    number = each.split(" ", 1)[1].replace('-', "").replace(" ", "")
                    country_code = each.split(" ", 1)[0].replace('+', "")
                    phone.append({"country_code": country_code, "number": number})
                elif "+" in each and " " not in each:
                    if "+374" in each:
                        phone.append({"country_code": "374",
                                      "number": each.replace("+374", "")})
                    elif "+1" in each:
                        phone.append({"country_code": "1",
                                      "number": each.replace("+1", "")})
                    else:
                        phone.append({"country_code": "374", "number": each})
                elif "+" not in each:
                    number = each.replace('-', "").replace(" ", "")
                    phone.append({"country_code": "374", "number": number})
        else:
            if "+" in phone and " " in phone:
                number = phone.split(" ", 1)[1].replace('-', "").replace(" ", "")
                country_code = phone.split(" ", 1)[0].replace('+', "")
                phone = [{"country_code": country_code, "number": number}]
            elif "+" in phone and " " not in phone:
                if "+374" in phone:
                    phone = [{"country_code": "374",
                              "number": phone.replace("+374", "")}]
                elif "+1" in phone:
                    phone = [{"country_code": "1",
                              "number": phone.replace("+1", "")}]
                else:
                    phone = [{"country_code": "374", "number": phone}]
            elif "+" not in phone:
                number = phone.replace('-', "").replace(" ", "")
                phone = [{"country_code": "374", "number": number}]
    except Exception:
        phone = []
    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get()
        website = website.strip()
        website = [] if "not" in website else [website]
    except:
        website = []
    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get()
        published = published.strip()
        publish_day = int(published.split("-")[2])
        publish_month = int(published.split("-")[1])
        publish_year = int(published.split("-")[0])
    except:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get()
        ends = ends.strip()
        deadline_day = int(ends.split("-")[2])
        deadline_month = int(ends.split("-")[1])
        deadline_year = int(ends.split("-")[0])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Career level
    try:
        career_level = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get()
        if career_level is None:
            career_level = ""
    except:
        career_level = ""
    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get()
        if education is None:
            education = ""
    except:
        education = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get()
        if experience is None:
            experience = ""
    except:
        experience = ""
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get()
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip().replace(".", ""))
            max_salary = int(salary[1].strip().replace(".", ""))
        elif "-" not in salary and salary != "N/A":
            min_salary = int(salary.replace(".", ""))
            max_salary = int(salary.replace(".", ""))
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0
    # Vacancy description
    try:
        v_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]').get()
        v_description = remove_tags(v_description).strip()
        v_description = v_description.replace('\xa0', " ")
    except:
        v_description = ""
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except:
                v_description_en = " "
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except:
        v_description_am = ""
        v_description_en = ""
    # Company description
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get()
        c_description = c_description.strip()
    except:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except:
                c_description_en = " "
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except:
        c_description_am = ""
        c_description_en = ""
    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }
    print(data)
    return data

# Vacancy("https://rezume.am/job/2184")
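# The branchy phone handling above repeats the same normalization for the
# single-number and comma-separated cases. A minimal consolidation sketch;
# parse_phone is a hypothetical helper, not part of the original scraper:

def parse_phone(raw, default_code="374"):
    """Normalize one raw phone string into a country_code/number dict."""
    raw = raw.strip()
    if raw.startswith("+") and " " in raw:
        code, number = raw.split(" ", 1)
        return {"country_code": code.lstrip("+"),
                "number": number.replace("-", "").replace(" ", "")}
    for code in ("374", "1"):
        if raw.startswith("+" + code):
            return {"country_code": code, "number": raw[len(code) + 1:]}
    return {"country_code": default_code,
            "number": raw.replace("-", "").replace(" ", "")}

# Usage sketch:
#   phone = [parse_phone(p) for p in raw_phone.split(",")]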