def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['GetPressReleaseListResult']:
                news_dict = Helper.get_news_dict()
                title = news['Headline'] if 'Headline' in news else ""
                url = news['LinkToUrl'] if 'LinkToUrl' in news else ""

                # Skip articles whose URL hash is already stored
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                description = news['ShortBody'] if 'ShortBody' in news else ""
                publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ""
                news_dict.update({
                    "title": title, "url": url, "formatted_sub_header": title,
                    "description": description, "link": url,
                    "publishedAt": publish_date, 'date': publish_date,
                    # Store the URL's md5 so the duplicate lookup above can ever
                    # match; the original stored the raw PressReleaseId here,
                    # which the {"news_url_uid": md5(url)} query never finds.
                    "news_url_uid": str(unqUrl),
                    "news_provider": "ppg"})
                bulk_obj.insert(news_dict)
                # Flush the bulk buffer roughly every 100 queued operations
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

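# --- Hedged sketch, not part of the original crawlers ---
# These methods appear to assume module-level imports along the lines of:
# datetime, hashlib, json, re, requests, bs4.BeautifulSoup, plus the
# project-internal crawler, DbOperations, QueryType, and Helper objects.
# Every method also counts queued writes by reaching into pymongo's legacy
# BulkOperationBuilder via its name-mangled attribute; a helper like the one
# below (a hypothetical addition) could centralize that fragile access. It uses
# exactly the attribute path the original code already relies on.
def _pending_ops(bulk_obj):
    """Return the number of operations queued on a legacy pymongo bulk object."""
    return len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'])
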
def crawler_news(self):
    try:
        loop = True
        count = 0
        while loop:
            response = crawler.MakeRequest(self.url.format(count=count), 'Get', postData=self.body, headers=self.header)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find_all('li', {'class': "wd_item"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('div', {'class': 'wd_title'})
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Skip articles whose URL hash is already stored
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print("Already saved. url - ( " + url + " )")
                        continue

                    description_data = news.find('div', {'class': 'wd_subtitle'})
                    description = description_data.text if description_data else ''
                    text_data = news.find('div', {'class': 'wd_summary'})
                    text = text_data.text if text_data else ''
                    publish_date_data = news.find('div', {'class': 'wd_date'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        # The original stored the subtitle under "text" as well,
                        # discarding the wd_summary it had just extracted.
                        "description": description, "text": text,
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "martin_corporation", "ticker": "martin_corporation_scrapped",
                        "industry_name": "martin_corporation", "news_provider": "martin_corporation"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                count += 100
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['NewsCategories'][0]['2020']:
                news_dict = Helper.get_news_dict()
                title = news['title'] if 'title' in news else ''
                # Keep the vanity path separate from the absolute URL: the detail
                # API expects the raw '_url_' value (as in the other Broadcom
                # crawler below), while the stored link uses the absolute form.
                vanity_url = news['_url_'] if '_url_' in news else ''
                url = "https://www.broadcom.com/" + str(vanity_url) if vanity_url else ''

                # Skip articles whose URL hash is already stored
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                publish_date_data = news['PublishDate'] if 'PublishDate' in news else ''
                publish_date = Helper.parse_date(publish_date_data)
                content_type = news['content_type'] if 'content_type' in news else ''
                cid = news['content_id'] if 'content_id' in news else ''
                final_url = ("https://www.broadcom.com/api/getjsonbyurl?vanityurl={url}"
                             "&locale=avg_en&updateddate=&ctype={content_type}&cid={cid}").format(
                    url=vanity_url, content_type=content_type, cid=cid)
                url_response = crawler.MakeRequest(final_url, 'Get', postData=self.body, headers=self.headers)
                url_json = json.loads(url_response.content.decode('utf-8'))
                url_soup = BeautifulSoup(url_json['Body'], 'html.parser')
                # Join the detail page's paragraphs, stripping newlines and
                # non-breaking spaces
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in url_soup.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "brodcom", "ticker": "brodcom_scrapped",
                    "industry_name": "brodcom", "news_provider": "brodcom"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

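# Hedged sketch, an assumed helper that is not in the original code: the
# "already saved" lookup is duplicated across several crawlers, and a shared
# predicate could factor it out. DbOperations, QueryType, and the news_url_uid
# convention are taken from the surrounding methods.
def _already_saved(news_collection, url):
    """True if a document keyed by this URL's md5 already exists."""
    uid = hashlib.md5(url.encode()).hexdigest()
    return bool(DbOperations.GetData(news_collection, {"news_url_uid": uid}, {}, QueryType.one))
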
def crawler_news(self):
    loop = True
    while loop:
        response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'omron_news')
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title = news['Title'] if 'Title' in news else ''
                publish_date_data = news['EntryDate'] if 'EntryDate' in news else ''
                publish_date = Helper.parse_date(publish_date_data)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "omron", "ticker": "omron_scrapped",
                    "industry_name": "omron", "news_provider": "omron"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'omron_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            # Advance the paging window in the request body
            self.body['StartRange'] += 1
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    loop = True
    while loop:
        response = requests.request("GET", self.url, headers=self.headers, params=self.body)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'te_news')
        news_data = soup.find_all('div', {'class': "listing-single"})
        next_page_data = soup.find_all('a', {'class': 'next disabled'})
        if news_data:
            # Note: only the first listing on each page is processed
            for news in news_data[:1]:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h3')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = "https://www.te.com" + str(url_data['href']) if url_data else ''
                publish_date_data = news.find('p', {'class': 'resource-date'})
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': "content-area rte-output"})
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "te", "ticker": "te_scrapped",
                    "industry_name": "te", "news_provider": "te"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'te_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            # A disabled "next" link marks the last page
            if next_page_data:
                loop = False
            else:
                self.body['page'] += 1
        else:
            print("News Not Found")
            loop = False  # stop instead of looping forever on an empty page

def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['GetPressReleaseListResult']:
                news_dict = Helper.get_news_dict()
                title = news['Headline'] if 'Headline' in news else ""
                url = "https://news.fiveyearsout.com" + str(news['LinkToDetailPage']) if 'LinkToDetailPage' in news else ''

                # Skip articles whose URL hash is already stored
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ''
                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': "module_body"})
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "arrow", "ticker": "arrow_scrapped",
                    "industry_name": "arrow", "news_provider": "arrow"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

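# Hedged sketch, an assumed helper that is not in the original code: most of
# the detail-page scrapers here join the text of every <p> tag after stripping
# newlines and non-breaking spaces. The same pattern could live in one place.
_CLEANER = re.compile(r'[\n\xa0]')

def _joined_paragraphs(container):
    """Concatenate the cleaned text of all <p> descendants of a bs4 tag."""
    if container is None:
        return ''
    return ''.join(_CLEANER.sub('', p.get_text()) for p in container.find_all('p'))
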
def crawler_news(self):
    loop = True
    page = 1
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'meggitt_news')
        news_data = soup.find_all('div', {'class': "news_list_item"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h4')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = url_data['href'] if url_data else ''
                publish_date_data = news.find('h5')
                # Drop the <span> inside the date heading before parsing; guard
                # against items without one so decompose() cannot crash
                if publish_date_data and publish_date_data.span:
                    publish_date_data.span.decompose()
                publish_date = Helper.parse_date(str(publish_date_data.text).split('\n')[0]) if publish_date_data and publish_date_data.text != '' else ''
                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': "pf-content"})
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "meggitt", "ticker": "meggitt_scrapped",
                    "industry_name": "meggitt", "news_provider": "meggitt"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'meggitt_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            page += 1
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    loop = True
    page = 1
    while loop:
        try:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, 'hbfuller_news')
            news_data = soup.find_all('div', {'class': "media"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h4', {'class': 'media-heading'})
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = "https://www.hbfuller.com" + str(url_data['href']) if url_data else ''
                    publish_date_data = news.find('div', {'class': 'listing-date'})
                    publish_date = Helper.parse_date(str(publish_date_data.text).strip()) if publish_date_data and publish_date_data.text != '' else ''
                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    # Walk down the article layout; if any level is missing, the
                    # AttributeError handler below ends the crawl
                    description_data = url_soup.find('div', {'class': 'row ar-body'}) \
                        .find('div', {'class': "col-xs-12 col-sm-8 col-md-9"}) \
                        .find('div', {'class': 'col-sm-12'}) \
                        .find('div', {'style': ''})
                    description = description_data.text.strip().split('\n')
                    description = ''.join(description[1:])
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "hbfuller", "ticker": "hbfuller_scrapped",
                        "industry_name": "hbfuller", "news_provider": "hbfuller"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'hbfuller_news')
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("News Not Found")
                loop = False
        except AttributeError:
            # Raised when a detail page lacks the expected layout; treat it as
            # the end of the listing
            print("News Not Found")
            loop = False

def crawler_news(self):
    loop = True
    page = 7  # the original crawl starts at page 7
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'xinyiglass_news')
        news_list = soup.find('div', {'class': 'NewsList'})
        if news_list:
            news_data = news_list.find_all('li')
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('div', {'class': 'title'})
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = "https://www.xinyiglass.com/" + str(url_data['href']) if url_data else ''
                    regex = re.compile(r'[\n\r\t]')
                    description_data = news.find('div', {'class': 'info'})
                    description = regex.sub("", description_data.text) if description_data else ''
                    # The day lives in a <span>, the year-month in an <em>
                    date = news.find('span')
                    year_month = news.find('em')
                    publish_date = Helper.parse_date(year_month.text + "-" + str(date.text)) if date and year_month else ''
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description.strip(), "text": description.strip(),
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "xinyiglass", "ticker": "xinyiglass_scrapped",
                        "industry_name": "xinyiglass", "news_provider": "xinyiglass"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'xinyiglass_news')
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("News Not Found")
                loop = False
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'roche_news')
    news_data = soup.find_all('article', {'class': "teaser img-float img-small clearfix"})
    if news_data:
        for news in news_data:
            news_dict = Helper.get_news_dict()
            title_data = news.find('h3')
            title = title_data.text if title_data else ""
            url_data = news.find('a', {'href': True})
            url = "https://www.roche.com" + str(url_data['href']) if url_data else ''
            description_data = news.find('p')
            description = description_data.text if description_data else ''
            publish_date_data = news.find('time')
            # Guard the decompose() call: teasers without a <time> tag would
            # otherwise raise AttributeError
            if publish_date_data and publish_date_data.span:
                publish_date_data.span.decompose()
            publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
            news_dict.update({
                "title": title, "url": url, "formatted_sub_header": title,
                "description": description, "link": url,
                "publishedAt": publish_date, 'date': publish_date,
                "news_provider": "roche"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'roche_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()

def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find_all('article', {'class': "teaser img-float img-small clearfix"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h3')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = "https://www.roche.com" + str(url_data['href']) if url_data else ''

                # Skip articles whose URL hash is already stored
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                description_data = news.find('p')
                description = description_data.text if description_data else ''
                publish_date_data = news.find('time')
                # Guard the decompose() call so teasers without a <time> tag
                # cannot raise AttributeError
                if publish_date_data and publish_date_data.span:
                    publish_date_data.span.decompose()
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "roche", "ticker": "roche_scrapped",
                    "industry_name": "roche", "news_provider": "roche"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'brodcom_news')
    news_data = json.loads(response.content.decode('utf-8'))
    if news_data:
        for news in news_data['NewsCategories'][0]['2020']:
            news_dict = Helper.get_news_dict()
            title = news['title'] if 'title' in news else ''
            # 'url' keeps the raw vanity path for the detail API; 'link' is the
            # absolute URL that gets stored
            url = news['_url_'] if '_url_' in news else ''
            link = "https://www.broadcom.com/" + str(news['_url_']) if '_url_' in news else ''
            publish_date_data = news['PublishDate'] if 'PublishDate' in news else ''
            publish_date = Helper.parse_date(publish_date_data)
            content_type = news['content_type'] if 'content_type' in news else ''
            cid = news['content_id'] if 'content_id' in news else ''
            final_url = ("https://www.broadcom.com/api/getjsonbyurl?vanityurl={url}"
                         "&locale=avg_en&updateddate=&ctype={content_type}&cid={cid}").format(
                url=url, content_type=content_type, cid=cid)
            url_response = crawler.MakeRequest(final_url, 'Get', postData=self.body, headers=self.headers)
            url_json = json.loads(url_response.content.decode('utf-8'))
            url_soup = BeautifulSoup(url_json['Body'], 'html.parser')
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in url_soup.find_all('p'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": link, "link": link, "news_url_uid": hashlib.md5(link.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "brodcom", "ticker": "brodcom_scrapped",
                "industry_name": "brodcom", "news_provider": "brodcom"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'brodcom_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'canon_news')
    news_data = soup.find('div', {'id': "newsRelease"})
    if news_data:
        for news in news_data.find_all('div', {'class': 'grid-sizer grid-item news--item'}):
            news_dict = Helper.get_news_dict()
            title_data = news.find('h4')
            title = title_data.text if title_data else ""
            url_data = news.find('a', {'href': True})
            url = "https://global.canon/" + str(url_data['href']) if url_data else ''
            publish_date_data = news.find('div', {'class': 'news-date'})
            publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
            url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
            url_soup = BeautifulSoup(url_response.content, 'html.parser')
            # The article body is the second col-row block on the detail page
            description_data = url_soup.find('div', {'id': 'news-detail'}).find_all('div', {'class': "col-row"})[1]
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in description_data.find_all('p'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "canon", "ticker": "canon_scrapped",
                "industry_name": "canon", "news_provider": "canon"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'canon_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    loop = True
    page = 1
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'tereos_news')
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title = news['title']['rendered'] if 'title' in news and 'rendered' in news['title'] else ''
                url = news['link'] if 'link' in news else ''
                publish_date_data = news['date_gmt'] if 'date_gmt' in news else ''
                publish_date = Helper.parse_date(publish_date_data)
                # The body arrives as ACF-style HTML; parse it and join the paragraphs
                description_data = BeautifulSoup(news['acf']['sections'][0]['text'], 'html.parser')
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "tereos", "ticker": "tereos_scrapped",
                    "industry_name": "tereos", "news_provider": "tereos"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'tereos_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            page += 1
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ahlstrom_munksjo_news')
    news_data = soup.find('section', {'class': "content-area"})
    if news_data:
        for news in news_data.find_all('p'):
            news_dict = Helper.get_news_dict()
            title_data = news.find('a')
            title = title_data.text if title_data else ""
            url_data = news.find('a', {'href': True})
            url = "https://www.ahlstrom-munksjo.com" + str(url_data['href']) if url_data else ''
            # Remove the anchor (if any) so only the date text remains
            if news.a:
                news.a.decompose()
            publish_date = Helper.parse_date(news.text.strip()) if news and news.text != '' else ''
            url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
            url_soup = BeautifulSoup(url_response.content, 'html.parser')
            description_data = url_soup.find('section', {'class': "content-area press-release-area "})
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in description_data.find_all('p'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "ahlstrom_munksjo", "ticker": "ahlstrom_munksjo_scrapped",
                "industry_name": "ahlstrom_munksjo", "news_provider": "ahlstrom_munksjo"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ahlstrom_munksjo_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    loop = True
    count = 25
    while loop:
        response = crawler.MakeRequest(self.url.format(count=count), 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'adp_news')
        news_data = soup.find_all('li', {'class': "wd_item"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('div', {'class': 'wd_title'})
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = url_data['href'] if url_data else ''
                description_data = news.find('div', {'class': 'wd_summary'})
                description = description_data.text if description_data else ''
                publish_date_data = news.find('div', {'class': 'wd_date'})
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "adp", "ticker": "adp_scrapped",
                    "industry_name": "adp", "news_provider": "adp"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'adp_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            count += 25
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'trimble_news')
    news_data = soup.find('div', {'class': 'newslist'}).find_all('div', {'class': "newsDate"})
    if news_data:
        for news in news_data:
            news_dict = Helper.get_news_dict()
            # Each newsDate div is followed by the anchor carrying title and href
            title = news.find_next_sibling().text if news.find_next_sibling() else ''
            url = "https://www.trimble.com" + str(news.find_next_sibling()['href']) if news.find_next_sibling() else ''
            publish_date = Helper.parse_date(news.text) if news and news.text != '' else ''
            url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
            url_soup = BeautifulSoup(url_response.content, 'html.parser')
            description_data = url_soup.find('div', {'class': "body"})
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in description_data.find_all('div'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "trimble", "ticker": "trimble_scrapped",
                "industry_name": "trimble", "news_provider": "trimble"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'trimble_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ppg_news')
    news_data = json.loads(response.content.decode('utf-8'))
    if news_data:
        for news in news_data['GetPressReleaseListResult']:
            news_dict = Helper.get_news_dict()
            title = news['Headline'] if 'Headline' in news else ""
            url = news['LinkToUrl'] if 'LinkToUrl' in news else ""
            description = news['ShortBody'] if 'ShortBody' in news else ""
            news_url_uid = news['PressReleaseId'] if 'PressReleaseId' in news else ""
            publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ""
            news_dict.update({
                "title": title, "url": url, "formatted_sub_header": title,
                "description": description, "link": url,
                "publishedAt": publish_date, 'date': publish_date,
                "news_url_uid": news_url_uid,
                "news_provider": "ppg"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ppg_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()

def crawler_news(self):
    loop = True
    page = 1
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'exxonmobil_corporation_news')
        news_data = soup.find_all('div', {'class': "contentCollection--item"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('a')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = "https://corporate.exxonmobil.com" + str(url_data['href']) if url_data else ''
                description_data = news.find('span', {'class': 'contentCollection--description p'})
                description = description_data.text if description_data else ''
                publish_date_data = news.find('span', {'class': 'date'})
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                news_dict.update({
                    "title": title, "url": url, "formatted_sub_header": title,
                    "description": description, "link": url,
                    "publishedAt": publish_date, 'date': publish_date,
                    "news_provider": "exxonmobil corporation"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'exxonmobil_corporation_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            page += 1
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'zeiss_news')
    news_data = json.loads(response.content.decode('utf-8'))
    if news_data:
        for news_id, news in news_data['elements'].items():
            news_dict = Helper.get_news_dict()
            title = news['title'].strip() if 'title' in news else ''
            url = "https://www.zeiss.com" + str(news['item_link']) if 'item_link' in news else ''
            publish_date_data = news['date'] if 'date' in news else ''
            publish_date = Helper.parse_date(publish_date_data) if publish_date_data != '' else ''
            description = news['description'] if 'description' in news else ''
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "zeiss", "ticker": "zeiss_scrapped",
                "industry_name": "zeiss", "news_provider": "zeiss"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'zeiss_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ams_news')
    news_data = soup.find_all('div', {'class': "col-md-4 col-sm-8 col-sm-offset-2 col-md-offset-0"})
    if news_data:
        for news in news_data:
            news_dict = Helper.get_news_dict()
            title_data = news.find('h3', {'class': 'info-box__title'})
            title = title_data.text.strip() if title_data else ""
            url_data = news.find('a', {'href': True})
            url = "https://www.ams.com" + str(url_data['href']) if url_data else ''
            description_data = news.find('div', {'class': "info-box__content"}).find('p')
            description = description_data.text if description_data else ''
            publish_date_data = news.find('p', {'class': 'info-box__date'})
            publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "ams", "ticker": "ams_scrapped",
                "industry_name": "ams", "news_provider": "ams"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'ams_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    # The NEC feed is RSS, so parse it as XML and walk the <item> entries
    soup = BeautifulSoup(response.content, 'xml')
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'nec_news')
    news_data = soup.find_all('item')
    if news_data:
        for news in news_data:
            news_dict = Helper.get_news_dict()
            title_data = news.find('title')
            title = title_data.text if title_data else ""
            url_data = news.find('link')
            url = url_data.text if url_data else ''
            description_data = news.find('description')
            description = description_data.text if description_data else ''
            publish_date_data = news.find('pubDate')
            publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "nec", "ticker": "nec_scrapped",
                "industry_name": "nec", "news_provider": "nec"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'nec_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()

def crawler_news(self):
    try:
        loop = True
        offset = 0
        while loop:
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            response = crawler.MakeRequest(self.url, 'Post', postData=self.body.format(off_set=offset), headers=self.headers)
            if response is not None:
                news_data = json.loads(response.content.decode('utf-8'))
                if 'count' in news_data and news_data['count'] > 0:
                    for news in news_data['pages']['items']:
                        print(news)  # debug output kept from the original
                        date = Helper.parse_date(news['news_date'])
                        if date:
                            # Stop once articles older than the current year appear
                            if date.year < datetime.datetime.now().year:
                                break
                        url = "https://www.infineon.com/" + news['url']

                        # Skip articles whose URL hash is already stored
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print("Already saved. url - ( " + url + " )")
                            continue

                        news_dict = Helper.get_news_dict()
                        description = self.fetchDescription(url)
                        news_dict.update({
                            "date": date, "news_provider": "Infineon",
                            "url": url, "formatted_sub_header": "",
                            "publishedAt": date, "description": description,
                            "title": news['title'], "ticker": "Infineon_scrapped",
                            "industry_name": "Infineon",
                            "news_title_uid": hashlib.md5(news['title'].encode()).hexdigest(),
                            "link": url, "text": description,
                            "company_id": "Infineon",
                            "news_url_uid": hashlib.md5(url.encode()).hexdigest()})
                        bulk_obj.insert(news_dict)
                        # Note: unlike the other crawlers, this one flushes the
                        # bulk buffer after every couple of queued inserts
                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 1:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                else:
                    print("No data found")
                    loop = False
                offset += 10
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
            else:
                break
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    loop = True
    page = 1
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
        # The endpoint returns JSON (wrapping an 'html' fragment) only while
        # more pages exist
        if response.headers['Content-Type'] == 'application/json; charset=utf-8':
            response_json = json.loads(response.content.decode('utf-8'))
        else:
            print("No data found")
            break
        soup = BeautifulSoup(response_json['html'], 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'audi_news')
        news_data = soup.find_all('li', {'class': 'page-list--item is-detailed infinite-nodes--list-item'})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h3')
                title = title_data.text.strip() if title_data else ""
                url_data = news.find('a', {'href': True})
                url = url_data['href'] if url_data else ''
                publish_date_data = news.find('div', {'class': 'meta--item'})
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                description_data = news.find('div', {'class': "page-list--text"})
                description = description_data.text.strip() if description_data else ''
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "audi", "ticker": "audi_scrapped",
                    "industry_name": "audi", "news_provider": "audi"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'audi_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            page += 1
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    loop = True
    count = 0
    while loop:
        response = crawler.MakeRequest(self.url.format(count=count), 'Get', postData=self.body, headers=self.header)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'martin_corporation_news')
        news_data = soup.find_all('li', {'class': "wd_item"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('div', {'class': 'wd_title'})
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = url_data['href'] if url_data else ''
                description_data = news.find('div', {'class': 'wd_subtitle'})
                description = description_data.text if description_data else ''
                text_data = news.find('div', {'class': 'wd_summary'})
                text = text_data.text if text_data else ''
                publish_date_data = news.find('div', {'class': 'wd_date'})
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                news_dict.update({
                    "title": title, "url": url, "formatted_sub_header": title,
                    "description": description, "link": url,
                    "publishedAt": publish_date, 'date': publish_date,
                    "news_provider": "martin corporation", "text": text})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'martin_corporation_news')
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            count += 100
        else:
            print("News Not Found")
            loop = False

def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find('div', {'class': "content-listing__items glide__slides"})
        if news_data:
            for news in news_data.find_all('a', {'class': 'content-listing__item glide__slide col-lg-3'}):
                news_dict = Helper.get_news_dict()
                regex = re.compile(r'[\r\n\xa0]')
                title_data = news.find('h3')
                title = regex.sub("", str(title_data.text.strip())) if title_data else ""
                url = "https://www.enersys.com" + str(news['href'])

                # Skip articles whose URL hash is already stored
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                publish_date_data = news.find('p', {'class': 'content-listing__item-date'})
                publish_date = Helper.parse_date(publish_date_data.text.strip()) if publish_date_data and publish_date_data.text != '' else ''
                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': "standard-page__body"})
                description = []
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)
                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "enersys", "ticker": "enersys_scrapped",
                    "industry_name": "enersys", "news_provider": "enersys"})
                bulk_obj.insert(news_dict)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'arrow_news')
    news_data = json.loads(response.content.decode('utf-8'))
    if news_data:
        for news in news_data['GetPressReleaseListResult']:
            news_dict = Helper.get_news_dict()
            title = news['Headline'] if 'Headline' in news else ""
            url = "https://news.fiveyearsout.com" + str(news['LinkToDetailPage']) if 'LinkToDetailPage' in news else ''
            publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ''
            url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
            url_soup = BeautifulSoup(url_response.content, 'html.parser')
            description_data = url_soup.find('div', {'class': "module_body"})
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in description_data.find_all('p'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)
            news_dict.update({
                "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description, "text": description,
                "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                "company_id": "arrow", "ticker": "arrow_scrapped",
                "industry_name": "arrow", "news_provider": "arrow"})
            bulk_obj.insert(news_dict)
            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'arrow_news')
        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")

def crawler_news(self):
    try:
        loop = True
        page = 0
        while loop:
            response = crawler.MakeRequest(self.url, 'Post', postData=self.body.format(page=page), headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data['News']:
                for news in news_data['News']:
                    news_dict = Helper.get_news_dict()
                    title = news['Title'] if 'Title' in news else ''
                    url = "https://www.novozymes.com" + str(news['Url']) if 'Url' in news else ''

                    # Skip articles whose URL hash is already stored
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news['CreationDate'] if 'CreationDate' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)
                    description = news['Content'] if 'Content' in news else ''
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "novozymes", "ticker": "novozymes_scrapped",
                        "industry_name": "novozymes", "news_provider": "novozymes"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
            else:
                break
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('div', {'class': "x-main full"}).find_all('div', {'class': 'x-container max width'})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h2', {'class': 'entry-title'})
                    title = title_data.text.strip() if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Skip articles whose URL hash is already stored
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('time', {'class': 'entry-date'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('div', {'class': "entry-content content"})
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    if description_data.h2 is not None:
                        # Keep only the paragraphs preceding the first <h2>;
                        # find_all_previous returns them closest-first, so
                        # reverse back into document order
                        for desc in description_data.h2.find_all_previous("p")[::-1]:
                            description.append(regex.sub("", str(desc.text)))
                    else:
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "titanx", "ticker": "titanx_scrapped",
                        "industry_name": "titanx", "news_provider": "titanx"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)

def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('tbody')
            # The site renders a single "There is no data." row past the last page
            if news_data and news_data.tr.text.strip() != 'There is no data.':
                for news in news_data.find_all('tr'):
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('td', {'class': 'title'})
                    title = title_data.text if title_data else ""

                    # Skip articles whose title hash is already stored
                    unqUrl = hashlib.md5(title.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print("Already saved. title - ( " + title + " )")
                        continue

                    publish_date_data = news.find_all('td')[3].text
                    publish_date = Helper.parse_date(publish_date_data) if publish_date_data and publish_date_data != '' else ''
                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "hanwhacorp", "ticker": "hanwhacorp_scrapped",
                        "industry_name": "hanwhacorp", "news_provider": "hanwhacorp"})
                    bulk_obj.insert(news_dict)
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error Occurred:", exc_info=True)