def parse(self, response):
    """Parse a cnyes.com news-search API response into FinanceCrawlItem objects.

    Decodes the JSON payload, builds one item per news entry, and resolves
    the company name from the highlighted ``<mark>...</mark>`` span when one
    exists, falling back to the company passed via ``response.meta``.
    Returns the list of populated items.
    """
    all_item = []
    print('-----------------------------------')
    print(response.url)
    print('-----------------------------------')
    json_datas = json.loads(response.text)
    datas = json_datas['items']['data']
    pattern = re.compile(r"<mark>.*</mark>")
    for data in datas:
        item = FinanceCrawlItem()
        time_num = int(data['publishAt'])
        date = datetime.fromtimestamp(time_num).strftime("%Y%m%d")
        match = pattern.search(data['content'])
        # BUGFIX: the original called .group() on the match BEFORE checking
        # for None, so any content without a <mark> span crashed with
        # AttributeError instead of taking the meta fallback below.
        if match:
            # Strip the highlight tags to recover the plain company name.
            content = match.group().replace('<mark>', '').replace('</mark>', '')
            item['company'] = content
        else:
            item['company'] = response.meta['company']
        item['date'] = date
        item['title'] = data['title']
        item['link'] = 'https://news.cnyes.com/news/id/' + str(data['newsId'])
        item['due_date'] = response.meta['date']
        all_item.append(item)
    return all_item
def parse_product(self, response):
    """Extract title, timestamp, and cleaned body text from an article page."""
    soup = BeautifulSoup(response.text, 'lxml')
    item = FinanceCrawlItem()
    item['title'] = soup.select_one('h1.margin_b20').text
    item['time'] = soup.select_one('div.icon_time').text
    # Strip newlines and the site's fixed tab-padding run from the body.
    raw_body = soup.select_one('div#news_detail_div').text
    item['text'] = raw_body.replace('\n', '').replace('\t\t\t\t\t \t\t', '')
    item['link'] = response.url
    return item
def parse_product(self, response):
    """Extract title, publish time, and first non-empty paragraph of an article."""
    soup = BeautifulSoup(response.text, 'lxml')
    item = FinanceCrawlItem()
    item['title'] = soup.select_one('h1').text
    item['time'] = soup.select_one('div.ndArticle_creat').text.replace('出版時間:', '')
    paragraphs = soup.select('div.ndArticle_margin p')
    # The first <p> is sometimes blank after cleanup; fall back to the second.
    body = paragraphs[0].text.replace('\xa0', '').replace('\t', '')
    if not body:
        body = paragraphs[1].text.replace('\xa0', '').replace('\t', '')
    item['text'] = body
    item['link'] = response.url
    return item
def parse_product(self, response):
    """Extract publish time and body text; title/link are passed via meta."""
    item = FinanceCrawlItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['title'] = response.meta['title']
    item['link'] = response.meta['link']
    item['time'] = soup.select_one('span.time').text
    # The last four <p> tags are skipped — presumably boilerplate/related
    # links at the foot of the article; confirm against the live page.
    item['text'] = ''.join(p.text for p in soup.select('div.text p')[:-4])
    return item
def parse_product(self, response):
    """Extract publish time and body text; title/link are passed via meta."""
    item = FinanceCrawlItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['title'] = response.meta['title']
    item['link'] = response.meta['link']
    item['time'] = soup.select_one('time.entry-date').text
    # First paragraph is skipped (presumably a lead/caption — confirm).
    paragraphs = soup.select('div.td-post-content p')[1:]
    # NOTE(review): `'figure' not in p` is bs4 Tag *membership* (checks the
    # tag's children), not a substring test; it looks intended to skip
    # figure-wrapping paragraphs — verify this matches the page markup.
    item['text'] = ''.join(p.text for p in paragraphs if 'figure' not in p)
    return item
def parse(self, response):
    """Parse a setn.com listing page into one FinanceCrawlItem per news entry.

    BUGFIX: the original created a single item before the loop, overwrote
    its fields on every iteration, and returned it after the loop — so only
    the LAST matching article was ever emitted. Every entry is now collected
    into a list and returned, matching the sibling cnyes ``parse`` callback
    (Scrapy accepts an iterable of items from a callback).
    """
    due_date = response.meta['date']
    company = response.meta['company']
    source = BeautifulSoup(response.text, 'lxml')
    news_tags = source.select('div.newsimg-area-item-2 ')
    items = []
    for i in news_tags:
        url = i.select_one('a.gt').get('href')
        title = i.select_one('div.newsimg-area-text-2 ').text
        news_date = i.select_one('div.label-area div.newsimg-date').text[:11]
        print(title, url, news_date, due_date)
        item = FinanceCrawlItem()
        item['company'] = company
        item['date'] = news_date
        item['due_date'] = due_date
        item['title'] = title
        item['link'] = 'https://www.setn.com/' + url
        items.append(item)
    return items
def parse_product(self, response):
    """Extract publish time and body text; title/link are passed via meta."""
    item = FinanceCrawlItem()
    soup = BeautifulSoup(response.text, 'lxml')
    # Two possible timestamp layouts; keep the original truthiness test
    # (an empty <time class="date"> tag is falsy in bs4, triggering the
    # fallback selector just as the original did).
    stamp_tag = soup.select_one('time.date')
    if stamp_tag:
        stamp = stamp_tag.get('datetime')
    else:
        stamp = soup.select_one('time.news-time').get('datetime')
    body = ''.join(p.text for p in soup.select('div.story p'))
    # [:-9] drops the trailing 9 chars of the datetime attribute —
    # presumably the time-of-day portion; confirm against the attribute format.
    item['time'] = stamp[:-9]
    item['title'] = response.meta['title'].replace('\u3000', '')
    item['link'] = response.meta['link']
    item['text'] = body
    return item