def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['GetPressReleaseListResult']:
                news_dict = Helper.get_news_dict()
                title = news['Headline'] if 'Headline' in news else ""
                url = news['LinkToUrl'] if 'LinkToUrl' in news else ""

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                description = news['ShortBody'] if 'ShortBody' in news else ""
                publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ""

                news_dict.update({
                    "title": title,
                    "url": url,
                    "formatted_sub_header": title,
                    "description": description,
                    "link": url,
                    "publishedAt": publish_date,
                    "date": publish_date,
                    # Store the URL hash so the duplicate check above matches saved records
                    "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "news_provider": "ppg"
                })
                bulk_obj.insert(news_dict)

                # Flush the bulk buffer periodically
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def processNewsBasedOnTitle(news_collection, processed_collection, company):
    isInserted = 0
    rowCount = 0
    for row in DbOperations.GetData(news_collection,
                                    {"is_used": {'$exists': False}, "news_provider": company}, {}):
        try:
            DbOperations.InsertIntoMongo(processed_collection, row)
            isInserted = 1
            print('Success in inserting Process collection => [title: "' + row['title'] + '"]')
            DbOperations.Update_oneMongo(news_collection,
                                         {"news_title_uid": row['news_title_uid']},
                                         {"$set": {"is_used": 1}})
            rowCount += 1
        except Exception as e:
            print('Error in inserting Process collection => [title: "' + row['title'] + '"]', e)
    return isInserted, rowCount
def crawler_news(self):
    try:
        loop = True
        count = 0
        while loop:
            response = crawler.MakeRequest(self.url.format(count=count), 'Get', postData=self.body, headers=self.header)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find_all('li', {'class': "wd_item"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('div', {'class': 'wd_title'})
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    description_data = news.find('div', {'class': 'wd_subtitle'})
                    description = description_data.text if description_data else ''
                    text_data = news.find('div', {'class': 'wd_summary'})
                    text = text_data.text if text_data else ''
                    publish_date_data = news.find('div', {'class': 'wd_date'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "martin_corporation", "ticker": "martin_corporation_scrapped",
                        "industry_name": "martin_corporation", "news_provider": "martin_corporation"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                count += 100
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler(self):
    try:
        response = crawler.MakeRequest(self.url, "Get")
        soup = BeautifulSoup(response.content, "html.parser")
        data = []
        boxs = soup.find_all("div", {"class": 'news-box span3 left'})
        for box in boxs:
            datadict = Helper.get_news_dict()
            url = "https://www.pemex.com" + box.find("a")['href']

            # Check if already present
            unqUrl = hashlib.md5(url.encode()).hexdigest()
            chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
            if chkIsExists:
                print(f"Already saved. url - ( {url} )")
                continue

            datadict.update({"url": url})
            description = self.fetchDescription(url)
            publish_date_text = box.find("p", {"class": "news-meta news-date"}).text
            title = box.find("div", {"class": "ms-WPBody h2"}).text
            datadict.update({
                "date": publish_date_text,
                "news_provider": "pemex",
                "formatted_sub_header": title,
                "publishedAt": Helper.parse_date(publish_date_text),
                "description": description,
                "title": title,
                "link": self.url,
                "text": description,
                "company_id": "pemex",
                "news_url_uid": hashlib.md5(url.encode()).hexdigest()
            })
            data.append(datadict)

        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['NewsCategories'][0]['2020']:
                news_dict = Helper.get_news_dict()
                title = news['title'] if 'title' in news else ''
                url = "https://www.broadcom.com/" + str(news['_url_']) if '_url_' in news else ''

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                publish_date_data = news['PublishDate'] if 'PublishDate' in news else ''
                publish_date = Helper.parse_date(publish_date_data)
                content_type = news['content_type'] if 'content_type' in news else ''
                cid = news['content_id'] if 'content_id' in news else ''
                final_url = "https://www.broadcom.com/api/getjsonbyurl?vanityurl={url}&locale=avg_en&updateddate=&ctype={content_type}&cid={cid}".format(
                    url=url, content_type=content_type, cid=cid)
                url_response = crawler.MakeRequest(final_url, 'Get', postData=self.body, headers=self.headers)
                url_json = json.loads(url_response.content.decode('utf-8'))
                url_soup = BeautifulSoup(url_json['Body'], 'html.parser')

                # Collect the article body from all paragraph tags
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in url_soup.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)

                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "brodcom", "ticker": "brodcom_scrapped",
                    "industry_name": "brodcom", "news_provider": "brodcom"
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['GetPressReleaseListResult']:
                news_dict = Helper.get_news_dict()
                title = news['Headline'] if 'Headline' in news else ""
                url = "https://news.fiveyearsout.com" + str(news['LinkToDetailPage']) if 'LinkToDetailPage' in news else ''

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ''

                # Fetch the detail page and collect the article body
                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': "module_body"})
                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)

                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "arrow", "ticker": "arrow_scrapped",
                    "industry_name": "arrow", "news_provider": "arrow"
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find_all('article', {'class': "teaser img-float img-small clearfix"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h3')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = "https://www.roche.com" + str(url_data['href']) if url_data else ''

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                description_data = news.find('p')
                description = description_data.text if description_data else ''
                publish_date_data = news.find('time')
                # Drop the nested <span> (if any) before parsing the date text
                if publish_date_data and publish_date_data.span:
                    publish_date_data.span.decompose()
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "roche", "ticker": "roche_scrapped",
                    "industry_name": "roche", "news_provider": "roche"
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
            else:
                break
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('div', {'class': "x-main full"}).find_all('div', {'class': 'x-container max width'})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h2', {'class': 'entry-title'})
                    title = title_data.text.strip() if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news.find('time', {'class': 'entry-date'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('div', {'class': "entry-content content"})
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    # Take the paragraphs before the first <h2> when one exists, otherwise all paragraphs
                    if description_data.h2 is not None:
                        for desc in description_data.h2.find_all_previous("p")[::-1]:
                            description.append(regex.sub("", str(desc.text)))
                    else:
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "titanx", "ticker": "titanx_scrapped",
                        "industry_name": "titanx", "news_provider": "titanx"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler(self):
    try:
        data = []
        # Fetch the listing page (the URL carries no page parameter to advance)
        response = crawler.MakeRequest(self.url, "Get")
        soup = BeautifulSoup(response.content, "html.parser")
        if response.status_code == 200:
            boxs = soup.find_all("div", {"class": 'item'})
            for box in boxs:
                date = Helper.parse_date(box.find("p", {"class": "fade"}).text)
                # Stop at the first article from a previous year
                if date and date.year < datetime.datetime.now().year:
                    break
                url = "https://www.bd.com/" + box.find("a")['href']

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                datadict = Helper.get_news_dict()
                description = self.fetchDescription(url)
                title = box.find("a").text.strip()
                datadict.update({
                    "url": url,
                    "date": date,
                    "news_provider": "Becton, Dickinson and Company",
                    "formatted_sub_header": title,
                    "publishedAt": date,
                    "description": description,
                    "title": title,
                    "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "link": url,
                    "text": description,
                    "ticker": "bd_scrapped",
                    "industry_name": "Becton, Dickinson and Company",
                    "company_id": "Becton, Dickinson and Company",
                    "news_url_uid": hashlib.md5(url.encode()).hexdigest()
                })
                data.append(datadict)

        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('tbody')
            if news_data and news_data.tr.text.strip() != 'There is no data.':
                for news in news_data.find_all('tr'):
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('td', {'class': 'title'})
                    title = title_data.text if title_data else ""

                    # Check if already present (deduplicated on title)
                    unqUrl = hashlib.md5(title.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. title - ( {title} )")
                        continue

                    publish_date_data = news.find_all('td')[3].text
                    publish_date = Helper.parse_date(publish_date_data) if publish_date_data and publish_date_data != '' else ''

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "hanwhacorp", "ticker": "hanwhacorp_scrapped",
                        "industry_name": "hanwhacorp", "news_provider": "hanwhacorp"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        while loop:
            response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                # Only the first entry of each response is processed before StartRange advances
                for news in news_data[:1]:
                    news_dict = Helper.get_news_dict()
                    title = news['Title'] if 'Title' in news else ''

                    # Check if already present (deduplicated on title)
                    unqUrl = hashlib.md5(title.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. title - ( {title} )")
                        continue

                    publish_date_data = news['EntryDate'] if 'EntryDate' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "omron", "ticker": "omron_scrapped",
                        "industry_name": "omron", "news_provider": "omron"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                self.body['StartRange'] += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title = news['title']['rendered'] if 'title' in news and 'rendered' in news['title'] else ''
                    url = news['link'] if 'link' in news else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news['date_gmt'] if 'date_gmt' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    # The article body is embedded as HTML in the first ACF section
                    description_data = BeautifulSoup(news['acf']['sections'][0]['text'], 'html.parser')
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "tereos", "ticker": "tereos_scrapped",
                        "industry_name": "tereos", "news_provider": "tereos"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            if response.headers['Content-Type'] == 'application/json; charset=utf-8':
                response_json = json.loads(response.content.decode('utf-8'))
            else:
                print("No data found")
                break
            soup = BeautifulSoup(response_json['html'], 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find_all('li', {'class': 'page-list--item is-detailed infinite-nodes--list-item'})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h3')
                    title = title_data.text.strip() if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news.find('div', {'class': 'meta--item'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''
                    description_data = news.find('div', {'class': "page-list--text"})
                    description = description_data.text.strip() if description_data else ''

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "audi", "ticker": "audi_scrapped",
                        "industry_name": "audi", "news_provider": "audi"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('section', {'class': "cmp-list news-list"})
            if news_data:
                items = news_data.find_all('div', {'class': 'col-12'})
                if items:
                    for news in items:
                        news_dict = Helper.get_news_dict()
                        title_data = news.find('div', {'class': "search-result-title"})
                        title = title_data.text if title_data else ""
                        url_data = news.find('a', {'href': True})
                        url = "https://www.lonza.com" + str(url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print(f"Already saved. url - ( {url} )")
                            continue

                        publish_date_data = news.find('div', {'class': 'search-result-label'})
                        publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content, 'html.parser')
                        description_data = url_soup.find('section', {'class': "cmp-news-listing"})
                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('li'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                            "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                            "description": description, "text": description,
                            "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                            "company_id": "lonza", "ticker": "lonza_scrapped",
                            "industry_name": "lonza", "news_provider": "lonza"
                        })
                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                        bulk_obj.execute()
                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find('div', {'id': "contents"}) if news_data: for news in news_data.find_all('dt', {'class': 'mgnT15'}): news_dict = Helper.get_news_dict() title_data = news.find_next_sibling().a title = title_data.text if title_data else "" url_data = news.find_next_sibling().a url = "https://www.toray.in/india/news/" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date = Helper.parse_date( str(news.text).split('\n') [0]) if news and news.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find_all( 'p', {'class': "mgnB20"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data: description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "toray", "ticker": "toray_scrapped", "industry_name": "toray", "news_provider": "toray" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        article_data = soup.find_all('div', {'class': 'article'})
        if article_data:
            for article in article_data:
                news_data = article.find_all('section', {'class': ""})
                # Process every other section starting from the second one
                for news in news_data[1::2]:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h2')
                    title = title_data.text if title_data else ""
                    url = news.find_next_sibling().a['href'] if news.find_next_sibling() else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    description_data = news.find('p')
                    description = description_data.text if description_data else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    publish_date_data = url_soup.find('p', {'class': 'meta large inline'})
                    publish_date = Helper.parse_date(publish_date_data.text.replace('|', "").strip()) if publish_date_data and publish_date_data.text != '' else ''

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "voestalpine", "ticker": "voestalpine_scrapped",
                        "industry_name": "voestalpine", "news_provider": "voestalpine"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 0
        while loop:
            response = crawler.MakeRequest(self.url, 'Post', postData=self.body.format(page=page), headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data['News']:
                for news in news_data['News']:
                    news_dict = Helper.get_news_dict()
                    title = news['Title'] if 'Title' in news else ''
                    url = "https://www.novozymes.com" + str(news['Url']) if 'Url' in news else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news['CreationDate'] if 'CreationDate' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)
                    description = news['Content'] if 'Content' in news else ''

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "novozymes", "ticker": "novozymes_scrapped",
                        "industry_name": "novozymes", "news_provider": "novozymes"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler(self):
    try:
        page = 1
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        while True:
            response = crawler.MakeRequest(self.url.format(page=page), "Get", headers=self.headers)
            if 'we did not find any results related' in response.text:
                break
            soup = BeautifulSoup(response.content, "html.parser")
            boxs = soup.find_all("li", {"class": 'utc-cards--item'})
            for box in boxs:
                date = box.find("time", {"class": "utc-card--date"}).text
                if date:
                    date = Helper.parse_date(date)
                    # Stop at the first article from a previous year
                    if date.year < datetime.datetime.now().year:
                        break
                url = "https://www.rtx.com" + box.find("a")['href']

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                datadict = Helper.get_news_dict()
                description = self.fetchDescription(url)
                title = box.find("a").text
                datadict.update({
                    "url": url,
                    "date": date,
                    "news_provider": "UNITED TECHNOLOGIES CORPORATION",
                    "formatted_sub_header": title,
                    "publishedAt": date,
                    "description": description,
                    "title": title,
                    "link": url,
                    "text": description,
                    "company_id": "rtx",
                    "news_url_uid": hashlib.md5(url.encode()).hexdigest()
                })
                bulk_obj.insert(datadict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            # Advance to the next results page
            page += 1

        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find('div', {'id': 'tab_news_release'})
        if news_data:
            for news in news_data.find_all('dt'):
                news_dict = Helper.get_news_dict()
                title_data = news.find_next_sibling().find_next_sibling().a
                title = title_data.text if title_data else ""
                url_data = news.find_next_sibling().find_next_sibling().a
                url = url_data['href'] if url_data else ''

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                publish_date_data = news.text if news.text != '' else ''
                publish_date = Helper.parse_date(publish_date_data)

                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "fujielectric", "ticker": "fujielectric_scrapped",
                    "industry_name": "fujielectric", "news_provider": "fujielectric"
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            try:
                response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                news_data = soup.find_all('div', {'class': "media"})
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()
                        title_data = news.find('h4', {'class': 'media-heading'})
                        title = title_data.text if title_data else ""
                        url_data = news.find('a', {'href': True})
                        url = "https://www.hbfuller.com" + str(url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print(f"Already saved. url - ( {url} )")
                            continue

                        publish_date_data = news.find('div', {'class': 'listing-date'})
                        publish_date = Helper.parse_date(str(publish_date_data.text).strip()) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content, 'html.parser')
                        description_data = (url_soup.find('div', {'class': 'row ar-body'})
                                            .find('div', {'class': "col-xs-12 col-sm-8 col-md-9"})
                                            .find('div', {'class': 'col-sm-12'})
                                            .find('div', {'style': ''}))
                        description = description_data.text.strip().split('\n')
                        description = ''.join(description[1:])

                        news_dict.update({
                            "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                            "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                            "description": description, "text": description,
                            "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                            "company_id": "hbfuller", "ticker": "hbfuller_scrapped",
                            "industry_name": "hbfuller", "news_provider": "hbfuller"
                        })
                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                        bulk_obj.execute()
                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
            except AttributeError:
                # A missing element on the page marks the end of the listing
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler(self):
    try:
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        response = crawler.MakeRequest(self.url, "Get")
        soup = BeautifulSoup(response.content, "html.parser")
        boxs = soup.find_all("div", {"class": 'listCol sort-item news-item'})
        for box in boxs:
            datadict = Helper.get_news_dict()
            url = "https://www.adlinktech.com" + box.find("a")['href']
            datadict.update({"url": url})

            # Check if already present
            unqUrl = hashlib.md5(url.encode()).hexdigest()
            chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
            if chkIsExists:
                print(f"Already saved. url - ( {url} )")
                continue

            date, description = self.fetchDescription(url)
            title = box.find("div", {"class": "contentText"}).text
            datadict.update({
                "date": Helper.parse_date(date),
                "news_provider": "adlink",
                "formatted_sub_header": title,
                "publishedAt": Helper.parse_date(date),
                "description": description,
                "title": title,
                "link": url,
                "ticker": "adlink_scrapped",
                "industry_name": "adlink",
                "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "text": description,
                "company_id": "adlink",
                "news_url_uid": hashlib.md5(url.encode()).hexdigest()
            })
            bulk_obj.insert(datadict)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    """Scrape news page by page for the configured URL."""
    try:
        loop = True
        page = 0
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find_all('div', {'class': "coh-column zmb-44 coh-visible-xs coh-col-xs-12 coh-visible-md coh-col-md-6 coh-col-md-push-0 coh-col-md-pull-0 coh-col-md-offset-0"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('a')
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = "https://www.zscaler.com/" + str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news.find('time', {'class': 'text-center bg-sea-green'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find_all('div', {'class': "col-sm-12"})
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data:
                        if desc.find('a', {'href': 'https://www.zscaler.com/'}) is not None:
                            description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "zscaler", "ticker": "zscaler_scrapped",
                        "industry_name": "zscaler", "news_provider": "zscaler"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    """Scrape news page by page for the configured URL."""
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find_all('li', {'class': "NewsPanel__item"})
        if news_data:
            for news in news_data:
                try:
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('div', {'class': 'NewsPanel__body'})
                    title = title_data.text.strip() if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = "https://www.kaneka.co.jp/" + str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    publish_date_data = news.find('time', {'class': 'NewsPanel__time'})
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('article', {'class': 'articleBody topics__mod'})
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "kaneka", "ticker": "kaneka_scrapped",
                        "industry_name": "kaneka", "news_provider": "kaneka"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                except Exception as e:
                    self.logger.error("Error occurred:", exc_info=True)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
        news_data = soup.find_all('div', class_='small-12 columns event-post-info')
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('h6')
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = str(url_data['href']) if url_data else ''

                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                if chkIsExists:
                    print(f"Already saved. url - ( {url} )")
                    continue

                publish_date_data = news.find('span', class_='date caps')
                publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div', {'class': ""})
                description = []
                regex = re.compile(r'[\n\xa0]')
                for string in description_data.stripped_strings:
                    description.append(regex.sub("", str(string.strip())))
                description = ''.join(description)

                news_dict.update({
                    "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                    "description": description, "text": description,
                    "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                    "company_id": "alertlogic", "topic_name": "press-releases",
                    "ticker": "alertlogic_scrapped", "industry_name": "alertlogic",
                    "news_provider": "alertlogic"
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    """Scrape news page by page for the configured URL."""
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_list = soup.find('div', {'class': 'NewsList'})
            if news_list:
                news_data = news_list.find_all('li')
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()
                        title_data = news.find('div', {'class': 'title'})
                        title = title_data.text if title_data else ""
                        url_data = news.find('a', {'href': True})
                        url = "https://www.xinyiglass.com/" + str(url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print(f"Already saved. url - ( {url} )")
                            continue

                        regex = re.compile(r'[\n\r\t]')
                        description_data = news.find('div', {'class': 'info'})
                        description = regex.sub("", description_data.text) if description_data else ''

                        # The day and the year-month are published in separate elements
                        date = news.find('span')
                        year_month = news.find('em')
                        publish_date = Helper.parse_date(year_month.text + "-" + str(date.text)) if date and year_month else ''

                        news_dict.update({
                            "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                            "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                            "description": description.strip(), "text": description.strip(),
                            "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                            "company_id": "xinyiglass", "ticker": "xinyiglass_scrapped",
                            "industry_name": "xinyiglass", "news_provider": "xinyiglass"
                        })
                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                        bulk_obj.execute()
                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
            else:
                print("All news has been scraped!")
                loop = False
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find( 'div', {'class': "content-listing__items glide__slides"}) if news_data: for news in news_data.find_all( 'a', {'class': 'content-listing__item glide__slide col-lg-3'}): news_dict = Helper.get_news_dict() regex = re.compile(r'[\r\n\xa0]') title_data = news.find('h3') title = regex.sub("", str( title_data.text.strip())) if title_data else "" url = "https://www.enersys.com" + str(news['href']) # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find( 'p', {'class': 'content-listing__item-date'}) publish_date = Helper.parse_date( publish_date_data.text.strip() ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': "standard-page__body"}) description = [] for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "enersys", "ticker": "enersys_scrapped", "industry_name": "enersys", "news_provider": "enersys" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("All news has been scrapped !!") except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler(self):
    try:
        data = []
        counter = 1
        while True:
            response = crawler.MakeRequest(self.url.format(counter=counter), "Get")
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                boxs = soup.find_all("div", {"class": 'unicom-newsListItem'})
                for box in boxs:
                    date = box.find("p", {"class": "unicom-listInformationDate"}).text
                    if date:
                        date = Helper.parse_date(date)
                        # Stop at the first article from a previous year
                        if date.year < datetime.datetime.now().year:
                            break
                    datadict = Helper.get_news_dict()
                    url = box.find("a")['href']

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    datadict.update({"newsurl": url})
                    description = self.fetchDescription(url)
                    title = box.find("h3", {"class": "unicom-newsListTitleIn"}).text
                    datadict.update({
                        "url": url,
                        "link": url,
                        "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "date": box.find("p", {"class": "unicom-listInformationDate"}).text,
                        "news_provider": "panasonic",
                        "formatted_sub_header": title,
                        "publishedAt": date,
                        "description": description,
                        "title": title
                    })
                    data.append(datadict)

                # Advance to the next archive page
                counter += 1
                self.url = "https://news.panasonic.com/global/all/all_{counter}.html"
            else:
                break

        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        loop = True
        offset = 0
        while loop:
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            response = crawler.MakeRequest(self.url, 'Post', postData=self.body.format(off_set=offset), headers=self.headers)
            if response is not None:
                news_data = json.loads(response.content.decode('utf-8'))
                if 'count' in news_data and news_data['count'] > 0:
                    for news in news_data['pages']['items']:
                        print(news)
                        date = Helper.parse_date(news['news_date'])
                        # Stop at the first article from a previous year
                        if date and date.year < datetime.datetime.now().year:
                            break
                        url = "https://www.infineon.com/" + news['url']

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print(f"Already saved. url - ( {url} )")
                            continue

                        news_dict = Helper.get_news_dict()
                        description = self.fetchDescription(url)
                        news_dict.update({
                            "date": date,
                            "news_provider": "Infineon",
                            "url": url,
                            "formatted_sub_header": "",
                            "publishedAt": date,
                            "description": description,
                            "title": news['title'],
                            "ticker": "Infineon_scrapped",
                            "industry_name": "Infineon",
                            "news_title_uid": hashlib.md5(news['title'].encode()).hexdigest(),
                            "link": url,
                            "text": description,
                            "company_id": "Infineon",
                            "news_url_uid": hashlib.md5(url.encode()).hexdigest()
                        })
                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 1:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
                else:
                    print("No data found")
                    loop = False

                offset += 10
                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
            else:
                break
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)
def crawler_news(self):
    try:
        for page in range(0, 5):
            self.url = baseurl + str(page)
            response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_body = soup.find('tbody')
            news_data = news_body.find_all('tr')
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()
                    news_header = news.find('td', class_='views-field views-field-field-nir-news-title')
                    title_data = news_header.find('a')
                    title = title_data.text if title_data else ""
                    url_data = news_header.find('a', {'href': True})
                    url = "https://investors.accuray.com" + str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                    if chkIsExists:
                        print(f"Already saved. url - ( {url} )")
                        continue

                    date_header = news.find('td', class_='views-field views-field-field-nir-news-date')
                    publish_date_data = date_header.find('time', class_='datetime')
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('div', {'class': "xn-content"})
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text.strip())))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description, "text": description,
                        "publishedAt": publish_date, "date": publish_date, "publishedAt_scrapped": publish_date,
                        "company_id": "accuracy", "ticker": "accuracy_scrapped",
                        "industry_name": "accuracy", "news_provider": "accuracy"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scraped!")
    except Exception as e:
        self.logger.error("Error occurred:", exc_info=True)