def crawler_news(self): response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers) news_json = json.loads(response.content.decode('utf-8')) if news_json: soup = BeautifulSoup(news_json['data'], 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, 'shinetsu_news') news_data = soup.find_all('div', {'class': "item"}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('div', {'class': 'title'}) title = title_data.text.strip().split( '\n')[0] if title_data else "" url_data = news.find('a', {'href': True}) url = url_data['href'] if url_data else '' publish_date_data = news.find('p', {'class': 'date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' if url.split('.')[-1] != 'pdf': url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': "content-news"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) else: description = '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "shinetsu", "ticker": "shinetsu_scrapped", "industry_name": "shinetsu", "news_provider": "shinetsu" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'shinetsu_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("News Not Found")
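# Every crawler in this module repeats the same flush logic: insert into a
# bulk op, peek at the private _BulkOperationBuilder__bulk.__dict__['ops']
# list, and execute once it passes 100 entries. That private attribute is an
# implementation detail of the MongoDB driver. A minimal sketch of a wrapper
# that tracks its own counter instead; BulkWriter is a hypothetical helper
# built on the DbOperations.Get_object_for_bulkop interface used above, not
# part of the existing codebase.
class BulkWriter:
    def __init__(self, collection_name, batch_size=100):
        self.collection_name = collection_name
        self.batch_size = batch_size
        self.pending = 0
        self.bulk = DbOperations.Get_object_for_bulkop(False, collection_name)

    def insert(self, doc):
        # Queue the document and flush once the batch is full.
        self.bulk.insert(doc)
        self.pending += 1
        if self.pending >= self.batch_size:
            self.flush()

    def flush(self):
        # Execute whatever is queued and start a fresh bulk operation.
        if self.pending:
            self.bulk.execute()
            self.bulk = DbOperations.Get_object_for_bulkop(False, self.collection_name)
            self.pending = 0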
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find('div', {'id': "newsRelease"}) if news_data: for news in news_data.find_all( 'div', {'class': 'grid-sizer grid-item news--item'}): news_dict = Helper.get_news_dict() title_data = news.find('h4') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://global.canon/" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find('div', {'class': 'news-date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find('div', { 'id': 'news-detail' }).find_all('div', {'class': "col-row"})[1] description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "canon", "ticker": "canon_scrapped", "industry_name": "canon", "news_provider": "canon" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
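# The "already saved" guard above (hash the URL, look up news_url_uid, skip
# on a hit) is repeated verbatim in most crawlers below. A sketch of the same
# check factored into one function; it assumes the DbOperations.GetData and
# QueryType API shown above and is illustrative rather than an existing helper.
import hashlib

def is_already_saved(news_collection, url):
    uid = hashlib.md5(url.encode()).hexdigest()
    existing = DbOperations.GetData(news_collection,
                                    {"news_url_uid": str(uid)}, {}, QueryType.one)
    if existing:
        print("Already saved. url - ( " + url + " )")
        return True
    return False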
def crawler_news(self):
    response = crawler.MakeRequest(self.url, 'Get', postData=self.body,
                                   headers=self.headers)
    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'arrow_news')
    # The listing endpoint returns JSON, so the body is decoded directly.
    news_data = json.loads(response.content.decode('utf-8'))
    if news_data:
        for news in news_data['GetPressReleaseListResult']:
            news_dict = Helper.get_news_dict()
            title = news['Headline'] if 'Headline' in news else ""
            url = "https://news.fiveyearsout.com" + str(
                news['LinkToDetailPage']) if 'LinkToDetailPage' in news else ''
            publish_date = Helper.parse_date(
                news['PressReleaseDate']) if 'PressReleaseDate' in news else ''

            # Fetch the detail page and collect its paragraphs as the description.
            url_response = crawler.MakeRequest(url, 'Get', postData=self.body,
                                               headers=self.headers)
            url_soup = BeautifulSoup(url_response.content, 'html.parser')
            description_data = url_soup.find('div', {'class': "module_body"})
            description = []
            regex = re.compile(r'[\n\xa0]')
            for desc in description_data.find_all('p'):
                description.append(regex.sub("", str(desc.text)))
            description = ''.join(description)

            news_dict.update({
                "title": title,
                "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                "url": url,
                "link": url,
                "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                "description": description,
                "text": description,
                "publishedAt": publish_date,
                'date': publish_date,
                "publishedAt_scrapped": publish_date,
                "company_id": "arrow",
                "ticker": "arrow_scrapped",
                "industry_name": "arrow",
                "news_provider": "arrow"
            })
            bulk_obj.insert(news_dict)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                bulk_obj.execute()
                bulk_obj = DbOperations.Get_object_for_bulkop(False, 'arrow_news')

        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
            bulk_obj.execute()
    else:
        print("News Not Found")
def crawler_news(self): try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = json.loads(response.content.decode('utf-8')) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title = news['title'][ 'rendered'] if 'title' in news and 'rendered' in news[ 'title'] else '' url = news['link'] if 'link' in news else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news[ 'date_gmt'] if 'date_gmt' in news else '' publish_date = Helper.parse_date(publish_date_data) description_data = BeautifulSoup( news['acf']['sections'][0]['text'], 'html.parser') description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "tereos", "ticker": "tereos_scrapped", "industry_name": "tereos", "news_provider": "tereos" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
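# The tereos crawler above reads news['acf']['sections'][0]['text'] directly,
# which raises KeyError or IndexError for posts without an "acf" section and
# aborts the whole page. A small defensive accessor (sketch; get_nested is a
# hypothetical helper, not part of the codebase):
def get_nested(data, *keys, default=''):
    # Walk the keys/indexes one by one, bailing out on the first miss.
    for key in keys:
        try:
            data = data[key]
        except (KeyError, IndexError, TypeError):
            return default
    return data

# Example: body_html = get_nested(news, 'acf', 'sections', 0, 'text')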
def crawler_news(self): try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) if (response.status_code == 200): soup = BeautifulSoup(response.content, 'html.parser') else: break bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find('div', { 'class': "x-main full" }).find_all('div', {'class': 'x-container max width'}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('h2', {'class': 'entry-title'}) title = title_data.text.strip() if title_data else "" url_data = news.find('a', {'href': True}) url = url_data['href'] if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find('time', {'class': 'entry-date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': "entry-content content"}) description = [] regex = re.compile(r'[\n\xa0]') if description_data.h2 != None: for desc in description_data.h2.find_all_previous( "p")[::-1]: description.append( regex.sub("", str(desc.text))) else: for desc in description_data.find_all('p'): description.append( regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "titanx", "ticker": "titanx_scrapped", "industry_name": "titanx", "news_provider": "titanx" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) bulk_obj = DbOperations.Get_object_for_bulkop(False, 'asahi_kasei_news') news_data = json.loads(response.content.decode('utf-8')) if news_data: for news_list in news_data[0]['2020'][1]['release'][0]['mooth']: for news in news_list['item']: news_dict = Helper.get_news_dict() title = news['text'] if 'text' in news else '' url = "https://www.asahi-kasei.com" + str( news['url']) if 'url' in news else '' publish_date_data = news['day'] if 'day' in news else '' publish_date = Helper.parse_date(publish_date_data) url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find('main', {'class': "main"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "asahi_kasei", "ticker": "asahi_kasei_scrapped", "industry_name": "asahi_kasei", "news_provider": "asahi_kasei" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'asahi_kasei_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("News Not Found")
def crawler_news(self): loop = True page = 0 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, 'aruplab_news') news_data = soup.find_all('div', {'class': "views-col"}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('h4') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.aruplab.com" + str( url_data['href']) if url_data else '' publish_date_data = news.find( 'span', {'class': 'views-field views-field-field-date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find('main', { 'role': "main" }).find( 'div', { 'class': 'field field--name-body field--type-text-with-summary field--label-hidden field__item' }) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "aruplab", "ticker": "aruplab_scrapped", "industry_name": "aruplab", "news_provider": "aruplab" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'aruplab_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("News Not Found") loop = False
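# The page-wise loops in these crawlers only terminate when a page comes back
# empty. If a site keeps returning its last page for out-of-range page numbers,
# the loop never ends. A sketch of the same pattern with a hard safety cap;
# fetch_page and max_pages are hypothetical, and the cap value is arbitrary.
def crawl_pages(fetch_page, max_pages=500):
    page = 0
    while page < max_pages:
        items = fetch_page(page)  # expected to return a list of parsed items
        if not items:
            break
        yield from items
        page += 1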
def crawler_news(self): """ This function will scrap news page wise for given url :return: """ try: loop = True page = 0 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, 'harvardbioscience_news') news_data = soup.find('tbody') if news_data: for news in news_data.find_all('tr'): try: news_dict = Helper.get_news_dict() title_data = news.find( 'td', { 'class': 'views-field views-field-field-nir-news-title' }).find('a', {'href': True}) title = title_data.text.strip( ) if title_data else "" url_data = news.find( 'td', { 'class': 'views-field views-field-field-nir-news-title' }).find('a', {'href': True}) url = "https://investor.harvardbioscience.com" + str( url_data['href']) if url_data else '' publish_date_data = news.find( 'time', {'class': 'datetime'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data else '' url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': 'node__content'}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append( regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "harvardbioscience", "ticker": "harvardbioscience_scrapped", "industry_name": "harvardbioscience", "news_provider": "harvardbioscience" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'harvardbioscience_news') except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("News Not Found") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find_all( 'div', class_='small-12 columns event-post-info') if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('h6') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = str(url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find('span', class_='date caps') publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find('div', {'class': ""}) description = [] regex = re.compile(r'[\n\xa0]') for string in description_data.stripped_strings: description.append(regex.sub("", str(string.strip()))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "alertlogic", "topic_name": "press-releases", "ticker": "alertlogic_scrapped", "industry_name": "alertlogic", "news_provider": "alertlogic" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("All news has been scrapped !!") except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop(False, 'voestalpine_news') article_data = soup.find_all('div', {'class': 'article'}) if article_data: for article in article_data: news_data = article.find_all('section', {'class': ""}) for news in news_data[1::2]: news_dict = Helper.get_news_dict() title_data = news.find('h2') title = title_data.text if title_data else "" url = news.find_next_sibling( ).a['href'] if news.find_next_sibling() else '' description_data = news.find('p') description = description_data.text if description_data else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') publish_date_data = url_soup.find( 'p', {'class': 'meta large inline'}) publish_date = Helper.parse_date( publish_date_data.text.replace('|', "").strip() ) if publish_date_data and publish_date_data.text != '' else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "voestalpine", "ticker": "voestalpine_scrapped", "industry_name": "voestalpine", "news_provider": "voestalpine" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'voestalpine_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute()
def crawler_news(self):
    try:
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page), 'Get',
                                           postData=self.body, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, self.news_collection)
            news_data = soup.find('div', {'class': 'block-region-results'})
            if news_data:
                for news in news_data.find_all('tr'):
                    news_dict = Helper.get_news_dict()
                    title_data = news.find('h3')
                    title = title_data.text if title_data else ""
                    url_data = news.find('a', {'href': True})
                    url = "https://www.polyone.com/" + str(
                        url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection,
                                                       {"news_url_uid": str(unqUrl)},
                                                       {}, QueryType.one)
                    if chkIsExists:
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('h5', {'class': 'float-left'})
                    publish_date = Helper.parse_date(
                        str(publish_date_data.text).split('\n')[0]
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('div', {'class': "block-region-top"})
                    # Body paragraphs precede the closing <strong> block and come back
                    # in reverse document order, so restore the original order.
                    description_data = description_data.strong.find_all_previous('p')[1:-3]
                    description_data.reverse()
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data:
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title": title,
                        "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                        "url": url,
                        "link": url,
                        "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                        "description": description,
                        "text": description,
                        "publishedAt": publish_date,
                        'date': publish_date,
                        "publishedAt_scrapped": publish_date,
                        "company_id": "polyone",
                        "ticker": "polyone_scrapped",
                        "industry_name": "polyone",
                        "news_provider": "polyone"
                    })
                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
                page += 1
            else:
                print("All news has been scraped !!")
                loop = False
    except Exception:
        self.logger.error("Error occurred while scraping polyone news", exc_info=True)
def crawler_news(self): try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) if response.headers[ 'Content-Type'] == 'application/json; charset=utf-8': response_json = json.loads( response.content.decode('utf-8')) else: print("No data found") break soup = BeautifulSoup(response_json['html'], 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find_all( 'li', { 'class': 'page-list--item is-detailed infinite-nodes--list-item' }) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('h3') title = title_data.text.strip() if title_data else "" url_data = news.find('a', {'href': True}) url = url_data['href'] if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find('div', {'class': 'meta--item'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' description_data = news.find( 'div', {'class': "page-list--text"}) description = description_data.text.strip( ) if description_data else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "audi", "ticker": "audi_scrapped", "industry_name": "audi", "news_provider": "audi" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
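# The audi crawler above compares Content-Type against the exact string
# 'application/json; charset=utf-8'; a missing header raises KeyError and a
# bare 'application/json' value fails the check. A more tolerant variant
# (sketch; parse_json_response is a hypothetical helper):
import json

def parse_json_response(response):
    # Returns the decoded JSON body, or None when the response is not JSON.
    content_type = response.headers.get('Content-Type', '')
    if 'application/json' not in content_type.lower():
        return None
    return json.loads(response.content.decode('utf-8'))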
def crawler_news(self): """ This function will scrap news page wise for given url :return: """ try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_list = soup.find('div', {'class': 'NewsList'}) if news_list: news_data = news_list.find_all('li') if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('div', {'class': 'title'}) title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.xinyiglass.com/" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue regex = re.compile(r'[\n\r\t]') description_data = news.find( 'div', {'class': 'info'}) description = regex.sub( "", description_data.text ) if description_data else '' date = news.find('span') year_month = news.find('em') publish_date = Helper.parse_date( (year_month.text) + "-" + str(date.text)) if date and year_month else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description.strip(), "text": description.strip(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "xinyiglass", "ticker": "xinyiglass_scrapped", "industry_name": "xinyiglass", "news_provider": "xinyiglass" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): loop = True page = 0 while loop: response = crawler.MakeRequest( self.url, 'Post', postData=self.body.format(page=page), headers=self.headers) bulk_obj = DbOperations.Get_object_for_bulkop( False, 'novozymes_news') news_data = json.loads(response.content.decode('utf-8')) if news_data['News']: for news in news_data['News']: news_dict = Helper.get_news_dict() title = news['Title'] if 'Title' in news else '' url = "https://www.novozymes.com" + str( news['Url']) if 'Url' in news else '' publish_date_data = news[ 'CreationDate'] if 'CreationDate' in news else '' publish_date = Helper.parse_date(publish_date_data) description = news['Content'] if 'Content' in news else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "novozymes", "ticker": "novozymes_scrapped", "industry_name": "novozymes", "news_provider": "novozymes" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'novozymes_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("News Not Found") loop = False
def crawler_news(self): loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, 'hanwhacorp_news') news_data = soup.find('tbody') if news_data and news_data.tr.text.strip() != 'There is no data.': for news in news_data.find_all('tr'): news_dict = Helper.get_news_dict() title_data = news.find('td', {'class': 'title'}) title = title_data.text if title_data else "" publish_date_data = news.find_all('td')[3].text publish_date = Helper.parse_date( publish_date_data ) if publish_date_data and publish_date_data != '' else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "hanwhacorp", "ticker": "hanwhacorp_scrapped", "industry_name": "hanwhacorp", "news_provider": "hanwhacorp" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'hanwhacorp_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("News Not Found") loop = False
def crawler_news(self): loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, 'zscaler_news') news_data = soup.find_all( 'div', {'class': "col-12 col-md-6 zpb-32 zpb-md-48"}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('a') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = url_data['href'] if url_data else '' publish_date_data = news.find( 'time', {'class': 'text-center bg-sea-green'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find_all( 'div', {'class': "col-sm-12"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data: if desc.find('a', {'href': 'https://www.zscaler.com/' }) != None: description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "zscaler", "ticker": "zscaler_scrapped", "industry_name": "zscaler", "news_provider": "zscaler" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'zscaler_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("News Not Found") loop = False
def crawler_news(self): """ This function will scrap news page wise for given url :return: """ try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find_all('li', {'class': "NewsPanel__item"}) if news_data: for news in news_data: try: news_dict = Helper.get_news_dict() title_data = news.find('div', {'class': 'NewsPanel__body'}) title = title_data.text.strip() if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.kaneka.co.jp/" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find( 'time', {'class': 'NewsPanel__time'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data else '' url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'article', {'class': 'articleBody topics__mod'}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "kaneka", "ticker": "kaneka_scrapped", "industry_name": "kaneka", "news_provider": "kaneka" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("All news has been scrapped !!") except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self):
    loop = True
    page = 1
    while loop:
        response = crawler.MakeRequest(self.url.format(page=page), 'Get',
                                       postData=self.body)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'jnj_news')
        news_data = soup.find_all('div', {'class': "MediaPromo-title"})
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()
                title_data = news.find('div', {'class': 'ResponsiveText-text'})
                title = title_data.text if title_data else ""
                url_data = news.find('a', {'href': True})
                url = url_data['href'] if url_data else ''

                url_response = crawler.MakeRequest(url, 'Get')
                url_soup_obj = BeautifulSoup(url_response.content, 'html.parser')
                # Publication metadata comes from the article's JSON-LD block.
                url_response_data = url_soup_obj.find(
                    'script', {'type': 'application/ld+json'})
                url_response_data = json.loads(url_response_data.text)
                if url_response_data:
                    publish_date = Helper.parse_date(
                        url_response_data['datePublished']
                    ) if 'datePublished' in url_response_data else ''
                    news_provider = url_response_data['publisher'][
                        'name'] if 'publisher' in url_response_data and 'name' in url_response_data['publisher'] else ''
                    industry_name = news_provider
                    news_dict.update({
                        "news_provider": news_provider,
                        "industry_name": industry_name,
                        "publishedAt": publish_date,
                        'date': publish_date
                    })

                description_data = url_soup_obj.find('div', {'class': 'FullBleedLede-dek'})
                description = description_data.text if description_data else ''

                news_dict.update({
                    "title": title,
                    "url": url,
                    "formatted_sub_header": title,
                    "description": description,
                    "link": url
                })
                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(False, 'jnj_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
            page += 1
        else:
            print("News Not Found")
            loop = False
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop(False, 'denso_news') news_data_1 = soup.find_all( 'div', { 'class': "menuBlock01--border menuBlock01--small menuBlock01 menuBlock01--right" }) news_data_2 = soup.find_all( 'div', {'class': "menuBlock01--border menuBlock01--small menuBlock01 "}) news_data = news_data_1 + news_data_2 if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('span', {"class": "menuBlock01__headingText"}) title = title_data.text.strip() if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.denso.com" + str( url_data['href']) if url_data else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_response_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_response_soup.find( 'span', {'class': 'heading01__copy heading01__copy--lead'}) description = description_data.text.strip( ) if description_data else '' publish_date_data = news.find('p', {"class": "menuBlock01__text"}) publish_date_data.span.decompose() publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' news_dict.update({ "title": title, "url": url, "formatted_sub_header": title, "description": description, "link": url, "publishedAt": publish_date, 'date': publish_date, "news_provider": "denso" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'denso_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0: bulk_obj.execute()
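# publish_date_data.span.decompose() in the denso crawler above assumes both
# the <p class="menuBlock01__text"> element and its nested <span> exist; a
# missing element raises AttributeError mid-run. A guarded version (sketch;
# extract_denso_date is a hypothetical helper built on the same Helper.parse_date):
def extract_denso_date(news):
    p = news.find('p', {'class': 'menuBlock01__text'})
    if p is None:
        return ''
    if p.span:
        p.span.decompose()  # drop the label span, keep only the date text
    text = p.get_text(strip=True)
    return Helper.parse_date(text) if text else ''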
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop(False, 'fujielectric_news') news_data = soup.find('div', {'id': 'tab_news_release'}) if news_data: for news in news_data.find_all('dt'): news_dict = Helper.get_news_dict() title_data = news.find_next_sibling().find_next_sibling().a title = title_data.text if title_data else "" url_data = news.find_next_sibling().find_next_sibling().a url = url_data['href'] if url_data else '' publish_date_data = news.text if news.text != '' else '' publish_date = Helper.parse_date(publish_date_data) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "fujielectric", "ticker": "fujielectric_scrapped", "industry_name": "fujielectric", "news_provider": "fujielectric" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'fujielectric_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0: bulk_obj.execute() else: print("News Not Found")
def crawler_news(self): try: loop = True while loop: response = crawler.MakeRequest(self.url, 'Post', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = json.loads(response.content.decode('utf-8')) if news_data: for news in news_data[:1]: news_dict = Helper.get_news_dict() title = news['Title'] if 'Title' in news else '' # Check if already present unqUrl = hashlib.md5(title.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_title_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. title - ( " + title + " )") continue publish_date_data = news[ 'EntryDate'] if 'EntryDate' in news else '' publish_date = Helper.parse_date(publish_date_data) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "omron", "ticker": "omron_scrapped", "industry_name": "omron", "news_provider": "omron" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() self.body['StartRange'] += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = json.loads(response.content.decode('utf-8')) if news_data: for news in news_data['GetPressReleaseListResult']: news_dict = Helper.get_news_dict() title = news['Headline'] if 'Headline' in news else "" url = "https://www.craneco.com" + str( news['LinkToDetailPage'] ) if 'LinkToDetailPage' in news else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date = Helper.parse_date( news['PressReleaseDate'] ) if 'PressReleaseDate' in news else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': "module_body clearfix"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "crane", "ticker": "crane_scrapped", "industry_name": "crane", "news_provider": "crane" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() else: print("All news has been scrapped !!") except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find('tbody') if news_data and news_data.tr.text.strip( ) != 'There is no data.': for news in news_data.find_all('tr'): news_dict = Helper.get_news_dict() title_data = news.find('td', {'class': 'title'}) title = title_data.text if title_data else "" # Check if already present unqUrl = hashlib.md5(title.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_title_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. title - ( " + title + " )") continue publish_date_data = news.find_all('td')[3].text publish_date = Helper.parse_date( publish_date_data ) if publish_date_data and publish_date_data != '' else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "hanwhacorp", "ticker": "hanwhacorp_scrapped", "industry_name": "hanwhacorp", "news_provider": "hanwhacorp" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop(False, 'enersys_news') news_data = soup.find( 'div', {'class': "content-listing__items glide__slides"}) if news_data: for news in news_data.find_all( 'a', {'class': 'content-listing__item glide__slide col-lg-3 '}): news_dict = Helper.get_news_dict() regex = re.compile(r'[\r\n\xa0]') title_data = news.find('h3') title = regex.sub("", str( title_data.text.strip())) if title_data else "" url = "https://www.enersys.com" + str(news['href']) publish_date_data = news.find( 'p', {'class': 'content-listing__item-date'}) publish_date = Helper.parse_date( publish_date_data.text.strip() ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', {'class': "standard-page__body"}) description = [] for desc in description_data.find_all('p'): description.append(regex.sub("", str(desc.text))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "enersys", "ticker": "enersys_scrapped", "industry_name": "enersys", "news_provider": "enersys" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'enersys_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0: bulk_obj.execute() else: print("News Not Found")
def crawler_news(self): response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop(False, 'olympus_news') news_data = soup.find('ul', {'class': "list-news-01"}) if news_data: for news in news_data.find_all('li'): news_dict = Helper.get_news_dict() title_data = news.find('span', {'class': 'text'}) title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.olympus-global.com" + str( url_data['href']) if url_data else '' publish_date_data = news.find('span', {'class': 'date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find('div', {'class': "area-content"}) description = [] regex = re.compile(r'[\n\xa0]') for desc in description_data.find_all('b'): description.append(regex.sub("", str(desc.text.strip()))) description = ''.join(description) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "olympus", "ticker": "olympus_scrapped", "industry_name": "olympus", "news_provider": "olympus" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, 'olympus_news') if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0: bulk_obj.execute() else: print("News Not Found")
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) article_data = soup.find_all('div', {'class': 'article'}) if article_data: for article in article_data: news_data = article.find_all('section', {'class': ""}) for news in news_data[1::2]: news_dict = Helper.get_news_dict() title_data = news.find('h2') title = title_data.text if title_data else "" url = news.find_next_sibling( ).a['href'] if news.find_next_sibling() else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue description_data = news.find('p') description = description_data.text if description_data else '' url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') publish_date_data = url_soup.find( 'p', {'class': 'meta large inline'}) publish_date = Helper.parse_date( publish_date_data.text.replace('|', "").strip() ) if publish_date_data and publish_date_data.text != '' else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "voestalpine", "ticker": "voestalpine_scrapped", "industry_name": "voestalpine", "news_provider": "voestalpine" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: loop = True page = 0 while loop: response = crawler.MakeRequest( self.url, 'Post', postData=self.body.format(page=page), headers=self.headers) bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = json.loads(response.content.decode('utf-8')) if news_data['News']: for news in news_data['News']: news_dict = Helper.get_news_dict() title = news['Title'] if 'Title' in news else '' url = "https://www.novozymes.com" + str( news['Url']) if 'Url' in news else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news[ 'CreationDate'] if 'CreationDate' in news else '' publish_date = Helper.parse_date(publish_date_data) description = news[ 'Content'] if 'Content' in news else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "novozymes", "ticker": "novozymes_scrapped", "industry_name": "novozymes", "news_provider": "novozymes" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: loop = True page = 1 while loop: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find_all('div', {'class': "contentCollection--item"}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('a') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://corporate.exxonmobil.com" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue description_data = news.find( 'span', {'class': 'contentCollection--description p'}) description = description_data.text if description_data else '' publish_date_data = news.find('span', {'class': 'date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' news_dict.update({ "title": title, "url": url, "formatted_sub_header": title, "description": description, "link": url, "publishedAt": publish_date, 'date': publish_date, "news_provider": "exxonmobil_corporation", "company_id": "exxonmobil_corporation", "ticker": "exxonmobil_corporation_scrapped", "industry_name": "exxonmobil_corporation" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: response = crawler.MakeRequest(self.url, 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find('ul', {'class': "newsPressList tabContent"}) if news_data: for news in news_data.find_all('li'): news_dict = Helper.get_news_dict() title_data = news.find('h3') title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = url_data['href'] if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find('span', {'class': 'date'}) publish_date = Helper.parse_date( publish_date_data.text ) if publish_date_data and publish_date_data.text != '' else '' news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "LnT", "ticker": "LnT_scrapped", "industry_name": "LnT", "news_provider": "LnT" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops'] ) > 0: bulk_obj.execute() except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
def crawler_news(self): try: loop = True page = 1 while loop: try: response = crawler.MakeRequest(self.url.format(page=page), 'Get', postData=self.body, headers=self.headers) soup = BeautifulSoup(response.content, 'html.parser') bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) news_data = soup.find_all('div', {'class': "media"}) if news_data: for news in news_data: news_dict = Helper.get_news_dict() title_data = news.find('h4', {'class': 'media-heading'}) title = title_data.text if title_data else "" url_data = news.find('a', {'href': True}) url = "https://www.hbfuller.com" + str( url_data['href']) if url_data else '' # Check if already present unqUrl = hashlib.md5(url.encode()).hexdigest() chkIsExists = DbOperations.GetData( self.news_collection, {"news_url_uid": str(unqUrl)}, {}, QueryType.one) if (chkIsExists): print("Already saved. url - ( " + url + " )") continue publish_date_data = news.find( 'div', {'class': 'listing-date'}) publish_date = Helper.parse_date( str(publish_date_data.text).strip() ) if publish_date_data and publish_date_data.text != '' else '' url_response = crawler.MakeRequest( url, 'Get', postData=self.body, headers=self.headers) url_soup = BeautifulSoup(url_response.content, 'html.parser') description_data = url_soup.find( 'div', { 'class': 'row ar-body' }).find('div', { 'class': "col-xs-12 col-sm-8 col-md-9" }).find('div', { 'class': 'col-sm-12' }).find('div', {'style': ''}) description = description_data.text.strip().split( '\n') description = ''.join(description[1:]) news_dict.update({ "title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(), "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(), "description": description, "text": description, "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date, "company_id": "hbfuller", "ticker": "hbfuller_scrapped", "industry_name": "hbfuller", "news_provider": "hbfuller" }) bulk_obj.insert(news_dict) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 100: bulk_obj.execute() bulk_obj = DbOperations.Get_object_for_bulkop( False, self.news_collection) if len(bulk_obj._BulkOperationBuilder__bulk. __dict__['ops']) > 0: bulk_obj.execute() page += 1 else: print("All news has been scrapped !!") loop = False except AttributeError as e: print("All news has been scrapped !!") loop = False except Exception as e: self.logger.error(f"Error Occured : \n", exc_info=True)
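# None of the crawlers above retry a failed request, so a transient timeout
# or 5xx aborts the run. A minimal retry wrapper around the crawler.MakeRequest
# call used throughout (sketch; the retries/delay parameters are hypothetical,
# and MakeRequest is assumed to raise or return None on failure):
import time

def make_request_with_retry(url, method, postData=None, headers=None,
                            retries=3, delay=5):
    for attempt in range(1, retries + 1):
        try:
            response = crawler.MakeRequest(url, method,
                                           postData=postData, headers=headers)
            if response is not None:
                return response
        except Exception:
            pass
        if attempt < retries:
            time.sleep(delay)
    return None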