def parse_next(self, response):
    """Parse the news-results listing and emit file items or Splash detail requests.

    Direct PDF links are yielded as file items; HTML links are fetched
    through Splash (0.5s render wait, basic-auth key) into parse_details.
    NOTE: PUBSTRING is intentionally not set here (original had it commented out).
    """
    base_url = 'https://www.centerpointenergy.com'
    for aux in response.xpath('//div[@class="news-results-items"]//div[@class="media"]'):
        item = SwisscomIvCrawlerItem()
        item['HEADLINE'] = aux.xpath('.//h3/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//h3/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Guard: entries without an href would crash on aux_url.lower().
            continue
        # Absolute links pass through; relative ones get the site prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = SplashRequest(
                url=url,
                splash_headers={'Authorization': basic_auth_header('535209af07354fbbb4110611b27f7504', '')},
                args={'wait': 0.5, 'timeout': 15},
                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/201,00101 Firefox/62.0'},
                callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the JSON news-release feed; yield file items for PDFs,
    otherwise follow the release detail page with the item in meta.
    DOCLINK initially holds the ReleaseID, later replaced by the full URL.
    """
    base_url = 'https://churchdwight.com/latest-news-article.aspx?id='
    for dat in json.loads(response.text)['NewsReleases']['NewsRelease']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['Date']
        item['HEADLINE'] = dat['Title']
        item['DOCLINK'] = dat['@attributes']['ReleaseID']
        aux_url = dat['@attributes']['ReleaseID']
        # ReleaseIDs are relative, so they get the article-query prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the GetPressReleaseList JSON feed; PDFs become file items,
    detail pages are requested with the partially-filled item in meta.
    """
    base_url = 'https://newsroom.thehartford.com'
    for dat in json.loads(response.text)['GetPressReleaseListResult']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['PressReleaseDate']
        item['HEADLINE'] = dat['Headline']
        item['DOCLINK'] = dat['LinkToDetailPage']
        aux_url = item['DOCLINK']
        # Absolute links pass through; relative ones get the site prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the GetPressReleaseList JSON feed; PDFs become file items,
    detail pages are requested with the partially-filled item in meta.
    """
    base_url = 'https://investor.corning.com'
    for prod in json.loads(response.text)['GetPressReleaseListResult']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = prod['PressReleaseDate']
        item['HEADLINE'] = prod['Headline']
        item['DOCLINK'] = prod['LinkToDetailPage']
        aux_url = prod['LinkToDetailPage']
        # Absolute links pass through; relative ones get the site prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the NewsPosts JSON feed and emit file items or detail requests.

    Converts the 13-digit .NET '/Date(ms)/' timestamp into a local
    'YYYY-mm-dd HH:MM:SS' string. The URL is always built as
    base_url + DetailsPageUrl (leading '/' stripped to avoid a double
    slash), so it always starts with 'http' — the original's
    'base_url + aux_url' fallback branches were unreachable and would
    have double-prefixed the URL; they are removed here.
    """
    body = json.loads(response.text)  # JSON response from the POST request
    base_url = 'https://www.eversource.com/content'
    for dat in body['NewsPosts']:
        # e.g. '/Date(1568030400000)/' -> 1568030400000 ms -> datetime
        timestamp = dat['Date'].split('(')[1].split(')')[0]
        your_dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000)
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = your_dt.strftime("%Y-%m-%d %H:%M:%S")
        item['HEADLINE'] = dat['Title']
        item['DOCLINK'] = dat['DetailsPageUrl']
        # Omit the first character (leading '/') of the relative path.
        url = base_url + dat['DetailsPageUrl'][1:]
        if '.pdf' in url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the release table (header rows excluded) and emit items
    or detail requests.

    Adds a guard for rows without a link: extract_first() returns None
    there and the original crashed on aux_url.lower().
    """
    base_url = 'https://cernercorporation.gcs-web.com'
    for aux in response.xpath('//table//tr[not(ancestor::thead)]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('./td/div[@class="news-release-head"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('./td/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('./td/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip rows without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the irwLoadingdata press-release blocks and emit items
    or detail requests.

    Adds a guard for blocks without an h4 link: extract_first() returns
    None there and the original crashed on aux_url.lower().
    """
    base_url = 'http://investors.essexapartmenthomes.com'
    for aux in response.xpath('//div[contains(@class,"irwLoadingdata")]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('./div[1]//div[@class="irwPRDate"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//h4/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//h4/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip entries without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the first 20 entries of the JSON document feed; PDFs and
    static-files links become file items, everything else is followed
    to parse_details with the item attached in meta.
    """
    base_url = 'https://ww3.jbhunt.com'
    body = json.loads(response.body.decode('utf-8'))
    for aux in body[0:20]:
        item = SwisscomIvCrawlerItem()
        # '/Date(1234567890123)/' -> ms timestamp -> local datetime string
        timestamp = aux['PublishDate'].split('Date(')[1].split(')')[0]
        your_dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000)
        item['PUBSTRING'] = your_dt.strftime("%Y-%m-%d %H:%M:%S")
        item['HEADLINE'] = aux['DocumentTitle']
        item['DOCLINK'] = aux['ContentUrl']
        aux_url = item['DOCLINK']
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the first 30 records of the JSON release feed; DOCLINK starts
    as the releaseId and is replaced by the full URL for file items.
    """
    base_url = 'https://www.thehersheycompany.com/bin/ssf/nasdaqNewsRelease?releaseId='
    for aux in json.loads(response.text)['Record'][0:30]:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux['date']
        item['HEADLINE'] = aux['title']
        item['DOCLINK'] = aux['releaseId']
        aux_url = aux['releaseId']
        # releaseIds are relative, so they get the query-URL prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the nir-widget news table and emit items or detail requests.

    Adds a guard for rows without a headline link (e.g. header rows):
    extract_first() returns None there and the original crashed on
    aux_url.lower().
    """
    base_url = 'https://investors.nov.com'
    for aux in response.xpath('//table//tr'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//td/div[@class="nir-widget--field nir-widget--news--date-time"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//td/div[@class="nir-widget--field nir-widget--news--headline"]//a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//td/div[@class="nir-widget--field nir-widget--news--headline"]/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip rows without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the item-list entries (footer excluded) and emit items or
    detail requests. Entries without a second anchor are yielded as-is
    (bare item with whatever fields were extracted).
    """
    base_url = 'https://iff.gcs-web.com'
    rows = response.xpath('//div[@class="item-list"]/ul/li[not(ancestor::div[@id="FooterWrapper"])]')
    for aux in rows:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//div[contains(@class, "name-field-nir-news-date")]/div/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//div[contains(@class, "nir-news-title")]/div/a[2]/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//div[contains(@class, "nir-news-title")]/div/a[2]/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # No link found: emit the partial item unchanged.
            yield item
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the release table (header rows excluded) and emit items
    or detail requests.

    Adds a guard for rows whose second cell has no link:
    extract_first() returns None there and the original crashed on
    aux_url.lower().
    """
    base_url = 'http://investors.nblenergy.com'
    for aux in response.xpath('//table//tr[not(ancestor::thead)]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//time/text()').extract_first()
        item['HEADLINE'] = aux.xpath('./td[2]/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('./td[2]/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip rows without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the col-sm-10 press-release blocks and emit items or
    detail requests. The link is taken from the hidden summary div,
    not the headline anchor.

    Adds a guard for blocks without a summary link: extract_first()
    returns None there and the original crashed on aux_url.lower().
    """
    base_url = 'http://investors.avalonbay.com'
    for quote in response.xpath('//div[@class="col-sm-10"]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = quote.xpath('.//div[@class="irwPRDate"]/text()').extract_first()
        item['HEADLINE'] = quote.xpath('./h4/a/text()').extract_first()
        item['DOCLINK'] = quote.xpath('.//div[@class="irwPRSummary irwHidden"]/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip blocks without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the ModuleItemRow press listing and emit items or detail requests.

    Fixes two inconsistencies with the sibling parsers: the direct-file
    branch now also sets an empty DESCRIPTION, and rows without a link
    are skipped instead of crashing on aux_url.lower().
    """
    base_url = 'https://investors.mgmresorts.com'
    for aux in response.xpath('//div[contains(@class, "ModuleItemRow")]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//span[@class="ModuleDate"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//span[@class="ModuleHeadline"]/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//a[@class="ModuleHeadlineLink"]/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip rows without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''  # consistency with the other parsers
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the JSON results feed and emit file items or detail requests.

    NOTE: base_url is 'https:/' (single slash) on purpose — pagepath
    apparently begins with '/', producing 'https://...' when joined.
    """
    base_url = 'https:/'
    for dat in json.loads(response.text)['results']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['properties']['dcterms:created']
        item['HEADLINE'] = dat['linktext']
        item['DOCLINK'] = dat['pagepath']
        aux_url = dat['pagepath']
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse HTML embedded in the JSON payload (body['d']['htmlData'])
    and emit items or detail requests.

    Adds a guard for entries without an h4 link: extract_first()
    returns None there and the original crashed on aux_url.lower().
    """
    body = json.loads(response.text)
    data = body['d']['htmlData']
    base_url = 'https://www.snapon.com'
    for aux in Selector(text=data).xpath('//div[contains(@class, "item")]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//span[@class="date"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//h4/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//h4/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip entries without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the GetPressReleaseList JSON feed; PDFs become file items,
    detail pages are requested with the partially-filled item in meta.
    """
    base_url = 'http://ir.ingersollrand.com'
    for dat in json.loads(response.text)['GetPressReleaseListResult']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['PressReleaseDate']
        item['HEADLINE'] = dat['Headline']
        item['DOCLINK'] = dat['LinkToDetailPage']
        aux_url = dat['LinkToDetailPage']
        # Absolute links pass through; relative ones get the site prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the widget-news release rows and emit items or detail requests.

    Links starting with '/EN' are site-root-relative; all other relative
    links are appended to the cpath query URL. Adds a guard for rows
    without a link: extract_first() returns None there and the original
    crashed on aux_url.startswith('/EN').
    """
    auxs = response.xpath('//div[contains(@class, "view-id-widget_news")]//tr[@class="news-release"]')
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//div[contains(@class, "datetimezone")]/div/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//div[@class="news-release-head"]/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//div[@class="news-release-head"]/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip rows without a link instead of raising AttributeError.
            continue
        base_url = 'https://www.snapon.com/EN/Investors/News-Releases/News-Release?cpath='
        if aux_url.startswith('/EN'):
            base_url = 'https://www.snapon.com'
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the wd_item press listing and emit items or detail requests.

    Fixes two inconsistencies with the sibling parsers: the direct-file
    branch now also sets an empty DESCRIPTION, and entries without a
    link are skipped instead of crashing on aux_url.lower().
    """
    base_url = 'https://ir.chipotle.com'
    for aux in response.xpath('//li[@class="wd_item"]'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//div[@class="wd_date"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//div[@class="wd_title"]/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//div[@class="wd_title"]/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip entries without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''  # consistency with the other parsers
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the first 25 MainContent_C004 list entries and emit items
    or detail requests.

    Adds a guard for entries without an h2 link: extract_first() returns
    None there and the original crashed on aux_url.lower().
    """
    base_url = 'https://www.dovercorporation.com/news/'
    auxs = response.xpath('//div[@id="MainContent_C004"]/ul/li')
    for aux in auxs[0:25]:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('./div/text()').extract_first()
        item['HEADLINE'] = aux.xpath('./h2/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('./h2/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip entries without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the region__inner news list and emit items or detail requests.
    Entries without an h3 link are skipped.
    """
    base_url = 'https://www.akamai.com'
    for aux in response.xpath('//div[@class="region__inner"]/section/ul/li'):
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//time/text()').extract_first()
        item['HEADLINE'] = aux.xpath('.//h3/a/text()').extract_first()
        item['DOCLINK'] = aux.xpath('.//h3/a/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # No link: nothing to follow or download.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the press-release table (trHeaders rows excluded) and emit
    items or detail requests.

    Rows whose title cell has no anchor fall back to the document cell
    (usually a direct PDF). Adds a guard for rows with no link in either
    cell: extract_first() returns None there and the original crashed
    on aux_url.lower().
    """
    auxs = response.xpath('//table//tr[not(self::*[@class="trHeaders"] or descendant::*[@class="trHeaders"] )]')
    base_url = 'http://investor.zimmerbiomet.com'
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('./th[@class="pr-date-field"]/text()[2]').extract_first()
        item['HEADLINE'] = aux.xpath('./td[@class="pr-title-field"]/a//text()[not(ancestor::span)]').extract_first()
        item['DOCLINK'] = aux.xpath('./td[@class="pr-title-field"]/a/@href').extract_first()
        aux_url = aux.xpath('./td[@class="pr-title-field"]/a/@href').extract_first()
        if not item['HEADLINE']:
            # Fallback: title is plain text; the link lives in the document cell.
            item['DOCLINK'] = aux.xpath('./td[@class="pr-document-field"]/a/@href').extract_first()
            item['HEADLINE'] = aux.xpath('./td[@class="pr-title-field"]/text()[2][not(ancestor::span)]').extract_first()
            aux_url = aux.xpath('./td[@class="pr-document-field"]/a/@href').extract_first()
        if not aux_url:
            # Skip rows with no link at all instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        # NOTE: original matched the bare substring 'pdf', not '.pdf'; kept as-is.
        if 'pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse up to 20 nir-widget list articles and emit items or
    detail requests.

    Adds a guard for articles without a second anchor: extract_first()
    returns None there and the original crashed on aux_url.lower().
    """
    auxs = response.xpath('//div[@class="nir-widget--list"]/article')
    if len(auxs) > 20:
        auxs = auxs[0:20]
    base_url = 'http://investor.xilinx.com'
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath('.//div[contains(@class, "date-time")]//text()').extract_first()
        item['HEADLINE'] = aux.xpath('./a[2]/text()').extract_first()
        item['DOCLINK'] = aux.xpath('./a[2]/@href').extract_first()
        aux_url = item['DOCLINK']
        if not aux_url:
            # Skip articles without a link instead of raising AttributeError.
            continue
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the JSON items feed; PDFs and static-files links become
    file items, everything else is followed to parse_details.
    """
    base_url = 'https://news.dentsplysirona.com'
    for aux in json.loads(response.body.decode('utf-8'))['items']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux['date']
        item['HEADLINE'] = aux['title']
        item['DOCLINK'] = aux['link']['href']
        aux_url = item['DOCLINK']
        # Absolute links pass through; relative ones get the site prefix.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower() or 'static-files' in aux_url.lower():
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the Goodyear headline list and yield one result per release.

    PDF / static-file links become file-download items; other links are
    followed with a further request to ``parse_details``.
    """
    base_url = 'https://corporate.goodyear.com'
    auxs = response.xpath('//ul/li[contains(@class, "headline__item")]')
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        # Raw date string; trimming/parsing happens downstream.
        item['PUBSTRING'] = aux.xpath('./a/p/text()').extract_first()
        item['HEADLINE'] = aux.xpath('./a/h3/text()').extract_first()
        aux_url = aux.xpath('./a/@href').extract_first()
        if not aux_url:
            # No href extracted -- skip rather than crash on .lower().
            continue
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        lowered = aux_url.lower()
        if '.pdf' in lowered or 'static-files' in lowered:
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the Berkley press-release JSON API and yield one result per release.

    PDF / static-file links become file-download items; other links are
    followed with a further request to ``parse_details``.
    """
    base_url = 'https://ir.berkley.com'
    body = json.loads(response.text)
    for dat in body['GetPressReleaseListResult']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['PressReleaseDate']
        item['HEADLINE'] = dat['Headline']
        aux_url = dat['LinkToDetailPage']
        if not aux_url:
            # Entry without a detail link -- skip rather than crash on .lower().
            continue
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        lowered = aux_url.lower()
        if '.pdf' in lowered or 'static-files' in lowered:
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the Hartford newsroom JSON response and yield one result per release.

    The JSON payload carries an HTML fragment in ``body['display']`` which is
    re-parsed with a Selector. PDF links become file-download items; other
    links are followed with a further request to ``parse_details``.
    """
    base_url = 'https://newsroom.thehartford.com'
    body = json.loads(response.text)
    # The listing markup is embedded as HTML inside the JSON payload.
    auxs = Selector(text=body['display']).xpath(
        '//div[@class="view-content"]/div[@class="item-list"][1]/ul/li')
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath(
            './/span[contains(@class, "views-field-created")]'
            '/span[@class="field-content"]/text()').extract_first()
        item['HEADLINE'] = aux.xpath(
            './/span[@class="field-content"]/a/h3/text()').extract_first()
        aux_url = aux.xpath(
            './/div[contains(@class, "views-field-title")]'
            '/span[@class="field-content"]/a/@href').extract_first()
        if not aux_url:
            # No href extracted -- skip rather than crash on .lower().
            continue
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the Tyson press-release JSON API and yield one result per release.

    PDF links become file-download items; other links are followed with a
    further request to ``parse_details``.
    """
    base_url = 'https://ir.tyson.com'
    body = json.loads(response.text)
    for dat in body['GetPressReleaseListResult']:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = dat['PressReleaseDate']
        item['HEADLINE'] = dat['Headline']
        aux_url = dat['LinkToDetailPage']
        if not aux_url:
            # Entry without a detail link -- skip rather than crash on .lower().
            continue
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse_next(self, response):
    """Parse the Xylem search-result listing and yield one result per release.

    Only the first 20 results are processed. PDF links become file-download
    items; other links are followed with a further request to
    ``parse_details``.
    """
    base_url = 'https://www.xylem.com'
    auxs = response.xpath('//div[@class="result-details"]')[:20]
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        item['PUBSTRING'] = aux.xpath(
            './/div[@class="press-release-date pure-u-1 pure-u-sm-1-4"]/text()'
        ).extract_first()
        item['HEADLINE'] = aux.xpath('.//h3/a/text()').extract_first()
        aux_url = aux.xpath('.//h3/a/@href').extract_first()
        if not aux_url:
            # No href extracted -- skip rather than crash on .lower().
            continue
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        if '.pdf' in aux_url.lower():
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            # Set an empty description like the sibling spiders do, so
            # downstream consumers always find the field.
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Parse the Globe Life link list and yield one result per release.

    The date is embedded in the link text and extracted with a regex.
    PDF / static-file links become file-download items; other links are
    followed with a further request to ``parse_details``.
    """
    base_url = 'https://investors.globelifeinsurance.com'
    # NOTE(review): the unescaped '.' after \d{2} matches any character;
    # presumably it tolerates a comma or period between day and year --
    # confirm against live link text before tightening.
    date_reg = r'.*\d{2}.\s*\d{4}'
    auxs = response.xpath('//ul[@class="linklist"]/li')
    for aux in auxs:
        item = SwisscomIvCrawlerItem()
        link_text = aux.xpath('./a/text()').extract_first()
        aux_url = aux.xpath('./a/@href').extract_first()
        if not link_text or not aux_url:
            # Malformed entry -- skip rather than crash on re.search(None)
            # or aux_url.lower().
            continue
        match = re.search(date_reg, link_text)
        # Keep the raw link text when no date is found instead of raising
        # AttributeError on match.group().
        item['PUBSTRING'] = match.group(0) if match else link_text
        item['HEADLINE'] = link_text
        item['DOCLINK'] = aux_url
        # Resolve site-relative links against the site root.
        url = aux_url if aux_url.startswith('http') else base_url + aux_url
        lowered = aux_url.lower()
        if '.pdf' in lowered or 'static-files' in lowered:
            # Direct document link: hand it to the files pipeline.
            item['file_urls'] = [url]
            item['DOCLINK'] = url
            item['DESCRIPTION'] = ''
            yield item
        else:
            # Article page: fetch it and extract the body in parse_details.
            request = scrapy.Request(url=url, callback=self.parse_details)
            request.meta['item'] = item
            yield request