Example #1
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
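            # Inserts are buffered on bulk_obj and flushed once more than 100
            # are pending; the length checks below reach into the name-mangled
            # _BulkOperationBuilder__bulk attribute (PyMongo's legacy bulk
            # builder) to count the queued operations.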
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                for news in news_data['GetPressReleaseListResult']:

                    news_dict = Helper.get_news_dict()

                    title = news['Headline'] if 'Headline' in news else ""
                    url = news['LinkToUrl'] if 'LinkToUrl' in news else ""

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    description = news[
                        'ShortBody'] if 'ShortBody' in news else ""
                    # Use the md5 of the URL so the duplicate check above can
                    # match documents that were saved earlier.
                    news_url_uid = hashlib.md5(url.encode()).hexdigest()
                    publish_date = Helper.parse_date(
                        news['PressReleaseDate']
                    ) if 'PressReleaseDate' in news else ""

                    news_dict.update({
                        "title": title,
                        "url": url,
                        "formatted_sub_header": title,
                        "description": description,
                        "link": url,
                        "publishedAt": publish_date,
                        'date': publish_date,
                        "news_url_uid": news_url_uid,
                        "news_provider": "ppg"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #2
 def processNewsBasedOnTitle(news_collection, processed_collection,
                             company):
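     # Copy rows for this provider that have not been marked "is_used" into the
     # processed collection, then flag them as used in the source collection.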
     isInserted = 0
     rowCount = 0
     for row in DbOperations.GetData(news_collection, {
             "is_used": {
                 '$exists': False
             },
             "news_provider": company
     }, {}):
         try:
             DbOperations.InsertIntoMongo(processed_collection, row)
             isInserted = 1
             print('Success in inserting Process collection => [title: "' +
                   row['title'] + '"]')
             DbOperations.Update_oneMongo(
                 news_collection, {"news_title_uid": row['news_title_uid']},
                 {"$set": {
                     "is_used": 1
                 }})
             rowCount = rowCount + 1
         except Exception as e:
             print(
                 'Error in inserting Process collection => [title: "' +
                 row['title'] + '"]', e)
             pass
     return isInserted, rowCount
    def crawler_news(self):
        try:
            loop = True
            count = 0
            while loop:
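                # The listing URL is paged through its {count} placeholder;
                # count advances by 100 after each batch until no items are
                # returned.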
                response = crawler.MakeRequest(self.url.format(count=count),'Get',postData=self.body,headers=self.header)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)
                news_data = soup.find_all('li', {'class': "wd_item"})
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('div', {'class': 'wd_title'})
                        title = title_data.text if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = url_data['href'] if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                                                           QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        description_data = news.find('div',{'class':'wd_subtitle'})
                        description = description_data.text if description_data else ''

                        text_data = news.find('div', {'class': 'wd_summary'})
                        text = text_data.text if text_data else ''

                        publish_date_data = news.find('div',{'class':'wd_date'})
                        publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                        news_dict.update(
                            {"title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                             "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                             "description": description, "text": text,
                             "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                             "company_id": "martin_corporation", "ticker": "martin_corporation_scrapped", "industry_name": "martin_corporation",
                             "news_provider": "martin_corporation"})

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) >100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                        bulk_obj.execute()

                    count += 100
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #4
    def crawler(self):
        try:
            response = crawler.MakeRequest(self.url, "Get")
            soup = BeautifulSoup(response.content, "html.parser")
            data = []
            boxs = soup.find_all("div", {"class": 'news-box span3 left'})
            for box in boxs:
                datadict = Helper.get_news_dict()
                url = "https://www.pemex.com" + box.find("a")['href']
                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(
                    self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                    QueryType.one)
                if (chkIsExists):
                    print("Already saved. url - ( " + url + " )")
                    continue

                datadict.update({"url": url})
                description = self.fetchDescription(url)
                datadict.update({
                    "date":
                    box.find("p", {
                        "class": "news-meta news-date"
                    }).text,
                    "news_provider":
                    "pemex",
                    "formatted_sub_header":
                    box.find("div", {
                        "class": "ms-WPBody h2"
                    }).text,
                    "publishedAt":
                    Helper.parse_date(
                        box.find("p", {
                            "class": "news-meta news-date"
                        }).text),
                    "description":
                    description,
                    "title":
                    box.find("div", {
                        "class": "ms-WPBody h2"
                    }).text,
                    "link":
                    self.url,
                    "text":
                    description,
                    "company_id":
                    "pemex",
                    "news_url_uid":
                    hashlib.md5(("https://www.pemex.com" +
                                 box.find("a")['href']).encode()).hexdigest()
                })
                data.append(datadict)

            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #5
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,'Get',postData=self.body,headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                for news in news_data['NewsCategories'][0]['2020']:
                    news_dict = Helper.get_news_dict()

                    title = news['title'] if 'title' in news else ''

                    url = "https://www.broadcom.com/"+str(news['_url_']) if '_url_' in news else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                                                       QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news['PublishDate'] if 'PublishDate' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    content_type = news['content_type'] if 'content_type' in news else ''
                    cid = news['content_id'] if 'content_id' in news else ''
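                    # Fetch the full article body from the getjsonbyurl detail
                    # endpoint and keep only the paragraph text from its HTML
                    # 'Body' field.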
                    final_url = "https://www.broadcom.com/api/getjsonbyurl?vanityurl={url}&locale=avg_en&updateddate=&ctype={content_type}&cid={cid}".format(
                        url=url, content_type=content_type, cid=cid)
                    url_response = crawler.MakeRequest(final_url, 'Get', postData=self.body, headers=self.headers)
                    url_json = json.loads(url_response.content.decode('utf-8'))
                    url_soup = BeautifulSoup(url_json['Body'], 'html.parser')
                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in url_soup.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))

                    description = ''.join(description)

                    news_dict.update(
                        {"title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                         "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                         "description": description, "text": description,
                         "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                         "company_id": "brodcom", "ticker": "brodcom_scrapped", "industry_name": "brodcom",
                         "news_provider": "brodcom"})

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) >100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scraped!")
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #6
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,'Get',postData=self.body,headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                for news in news_data['GetPressReleaseListResult']:
                    news_dict = Helper.get_news_dict()

                    title = news['Headline'] if 'Headline' in news else ""

                    url = "https://news.fiveyearsout.com"+str(news['LinkToDetailPage']) if 'LinkToDetailPage' in news else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                                                       QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date = Helper.parse_date(news['PressReleaseDate']) if 'PressReleaseDate' in news else ''

                    url_response = crawler.MakeRequest(url, 'Get', postData=self.body, headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content, 'html.parser')
                    description_data = url_soup.find('div',{'class':"module_body"})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description= ''.join(description)

                    news_dict.update(
                        {"title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                         "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                         "description": description, "text": description,
                         "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                         "company_id": "arrow", "ticker": "arrow_scrapped", "industry_name": "arrow",
                         "news_provider": "arrow"})

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) >100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scraped!")
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #7
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,'Post',postData=self.body,headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)
            news_data = soup.find_all('article', {'class': "teaser img-float img-small clearfix"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h3')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = "https://www.roche.com"+str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                                                       QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    description_data = news.find('p')
                    description = description_data.text if description_data else ''

                    publish_date_data = news.find('time')
                    # Drop the nested <span> so only the date text is parsed.
                    if publish_date_data and publish_date_data.span:
                        publish_date_data.span.decompose()
                    publish_date = Helper.parse_date(publish_date_data.text) if publish_date_data and publish_date_data.text != '' else ''

                    news_dict.update(
                        {"title": title, "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                         "url": url, "link": url, "news_url_uid": hashlib.md5(url.encode()).hexdigest(),
                         "description": description, "text": description,
                         "publishedAt": publish_date, 'date': publish_date, "publishedAt_scrapped": publish_date,
                         "company_id": "roche", "ticker": "roche_scrapped", "industry_name": "roche",
                         "news_provider": "roche"})

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) >100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(False,self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                    bulk_obj.execute()
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #8
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                if (response.status_code == 200):
                    soup = BeautifulSoup(response.content, 'html.parser')
                else:
                    break
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('div', {
                    'class': "x-main full"
                }).find_all('div', {'class': 'x-container max width'})
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h2', {'class': 'entry-title'})
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = url_data['href'] if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find('time',
                                                      {'class': 'entry-date'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'div', {'class': "entry-content content"})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
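                        # When the article contains an <h2>, only the
                        # paragraphs before it are treated as the body;
                        # otherwise every <p> is kept.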
                        if description_data.h2 != None:
                            for desc in description_data.h2.find_all_previous(
                                    "p")[::-1]:
                                description.append(
                                    regex.sub("", str(desc.text)))
                        else:
                            for desc in description_data.find_all('p'):
                                description.append(
                                    regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "titanx",
                            "ticker":
                            "titanx_scrapped",
                            "industry_name":
                            "titanx",
                            "news_provider":
                            "titanx"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #9
    def crawler(self):
        try:
            data = []
            while True:

                response = crawler.MakeRequest(self.url, "Get")
                soup = BeautifulSoup(response.content, "html.parser")
                if response.status_code == 200:

                    boxs = soup.find_all("div", {"class": 'item'})
                    for box in boxs:
                        date = Helper.parse_date(
                            box.find("p", {
                                "class": "fade"
                            }).text)
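                        # Stop at the first item dated before the current year
                        # (the listing is assumed to be newest-first).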
                        if date:
                            if date.year < datetime.datetime.now().year:
                                break

                        url = "https://www.bd.com/" + box.find("a")['href']

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue
                        datadict = Helper.get_news_dict()
                        datadict.update({"url": url})
                        description = self.fetchDescription(url)
                        datadict.update({
                            "date":
                            Helper.parse_date(
                                box.find("p", {
                                    "class": "fade"
                                }).text),
                            "news_provider":
                            "Becton, Dickinson and Company",
                            "formatted_sub_header":
                            box.find("a").text.strip(),
                            "publishedAt":
                            Helper.parse_date(
                                box.find("p", {
                                    "class": "fade"
                                }).text),
                            "description":
                            description,
                            "title":
                            box.find("a").text.strip(),
                            "news_title_uid":
                            hashlib.md5(box.find(
                                "a").text.strip().encode()).hexdigest(),
                            "link":
                            url,
                            "text":
                            description,
                            "ticker":
                            "bd_scrapped",
                            "industry_name":
                            "Becton, Dickinson and Company",
                            "company_id":
                            "Becton, Dickinson and Company",
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest()
                        })
                        data.append(datadict)
                    # self.url is not paginated, so make a single pass instead
                    # of requesting the same page repeatedly.
                    break
                else:
                    break
            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #10
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('tbody')
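                # An empty results page renders a single row reading
                # "There is no data.", which marks the last page.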
                if news_data and news_data.tr.text.strip(
                ) != 'There is no data.':
                    for news in news_data.find_all('tr'):
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('td', {'class': 'title'})
                        title = title_data.text if title_data else ""

                        # Check if already present
                        unqUrl = hashlib.md5(title.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. title - ( " + title + " )")
                            continue

                        publish_date_data = news.find_all('td')[3].text
                        publish_date = Helper.parse_date(
                            publish_date_data
                        ) if publish_date_data and publish_date_data != '' else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "hanwhacorp",
                            "ticker":
                            "hanwhacorp_scrapped",
                            "industry_name":
                            "hanwhacorp",
                            "news_provider":
                            "hanwhacorp"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #11
    def crawler_news(self):
        try:
            loop = True
            while loop:
                response = crawler.MakeRequest(self.url,
                                               'Post',
                                               postData=self.body,
                                               headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data:
                    for news in news_data[:1]:
                        news_dict = Helper.get_news_dict()

                        title = news['Title'] if 'Title' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(title.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. title - ( " + title + " )")
                            continue

                        publish_date_data = news[
                            'EntryDate'] if 'EntryDate' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "omron",
                            "ticker":
                            "omron_scrapped",
                            "industry_name":
                            "omron",
                            "news_provider":
                            "omron"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    self.body['StartRange'] += 1
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #12
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title = news['title'][
                            'rendered'] if 'title' in news and 'rendered' in news[
                                'title'] else ''

                        url = news['link'] if 'link' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news[
                            'date_gmt'] if 'date_gmt' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        description_data = BeautifulSoup(
                            news['acf']['sections'][0]['text'], 'html.parser')

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "tereos",
                            "ticker":
                            "tereos_scrapped",
                            "industry_name":
                            "tereos",
                            "news_provider":
                            "tereos"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #13
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                if response.headers[
                        'Content-Type'] == 'application/json; charset=utf-8':
                    response_json = json.loads(
                        response.content.decode('utf-8'))
                else:
                    print("No data found")
                    break
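                # The JSON response carries the rendered list markup in its
                # 'html' field, so that fragment is parsed rather than the raw
                # response body.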
                soup = BeautifulSoup(response_json['html'], 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find_all(
                    'li', {
                        'class':
                        'page-list--item is-detailed infinite-nodes--list-item'
                    })
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h3')
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = url_data['href'] if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find('div',
                                                      {'class': 'meta--item'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        description_data = news.find(
                            'div', {'class': "page-list--text"})
                        description = description_data.text.strip(
                        ) if description_data else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "audi",
                            "ticker":
                            "audi_scrapped",
                            "industry_name":
                            "audi",
                            "news_provider":
                            "audi"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #14
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('section',
                                      {'class': "cmp-list news-list"})
                if news_data:
                    news_items = news_data.find_all('div', {'class': 'col-12'})
                    if news_items:
                        for news in news_items:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find(
                                'div', {'class': "search-result-title"})
                            title = title_data.text if title_data else ""

                            url_data = news.find('a', {'href': True})
                            url = "https://www.lonza.com" + str(
                                url_data['href']) if url_data else ''

                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            publish_date_data = news.find(
                                'div', {'class': 'search-result-label'})
                            publish_date = Helper.parse_date(
                                publish_date_data.text
                            ) if publish_date_data and publish_date_data.text != '' else ''

                            url_response = crawler.MakeRequest(
                                url,
                                'Get',
                                postData=self.body,
                                headers=self.headers)
                            url_soup = BeautifulSoup(url_response.content,
                                                     'html.parser')
                            description_data = url_soup.find(
                                'section', {'class': "cmp-news-listing"})

                            description = []
                            regex = re.compile(r'[\n\xa0]')
                            for desc in description_data.find_all('li'):
                                description.append(
                                    regex.sub("", str(desc.text)))
                            description = ''.join(description)

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description,
                                "text":
                                description,
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "lonza",
                                "ticker":
                                "lonza_scrapped",
                                "industry_name":
                                "lonza",
                                "news_provider":
                                "lonza"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 0:
                            bulk_obj.execute()

                        page += 1
                    else:
                        print("All news has been scraped!")
                        loop = False
                else:
                    print("All news has been scraped!")
                    loop = False
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #15
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find('div', {'id': "contents"})
            if news_data:
                for news in news_data.find_all('dt', {'class': 'mgnT15'}):
                    news_dict = Helper.get_news_dict()

                    title_data = news.find_next_sibling().a
                    title = title_data.text if title_data else ""

                    url_data = news.find_next_sibling().a
                    url = "https://www.toray.in/india/news/" + str(
                        url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date = Helper.parse_date(
                        str(news.text).split('\n')
                        [0]) if news and news.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find_all(
                        'p', {'class': "mgnB20"})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data:
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "toray",
                        "ticker":
                        "toray_scrapped",
                        "industry_name":
                        "toray",
                        "news_provider":
                        "toray"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #16
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            article_data = soup.find_all('div', {'class': 'article'})
            if article_data:
                for article in article_data:
                    news_data = article.find_all('section', {'class': ""})
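                    # The class-less sections appear to alternate between
                    # teaser blocks and link blocks, so every second one is
                    # taken and its link read from the following sibling.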
                    for news in news_data[1::2]:

                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h2')
                        title = title_data.text if title_data else ""

                        url = news.find_next_sibling(
                        ).a['href'] if news.find_next_sibling() else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        description_data = news.find('p')
                        description = description_data.text if description_data else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        publish_date_data = url_soup.find(
                            'p', {'class': 'meta large inline'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text.replace('|', "").strip()
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "voestalpine",
                            "ticker":
                            "voestalpine_scrapped",
                            "industry_name":
                            "voestalpine",
                            "news_provider":
                            "voestalpine"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()
        except Exception as e:
            self.logger.error("Error occurred:", exc_info=True)
Example #17
    def crawler_news(self):
        try:
            loop = True
            page = 0
            while loop:
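                # Results are paged through a page placeholder inside the POST
                # body template.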
                response = crawler.MakeRequest(
                    self.url,
                    'Post',
                    postData=self.body.format(page=page),
                    headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data['News']:
                    for news in news_data['News']:
                        news_dict = Helper.get_news_dict()

                        title = news['Title'] if 'Title' in news else ''

                        url = "https://www.novozymes.com" + str(
                            news['Url']) if 'Url' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news[
                            'CreationDate'] if 'CreationDate' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        description = news[
                            'Content'] if 'Content' in news else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "novozymes",
                            "ticker":
                            "novozymes_scrapped",
                            "industry_name":
                            "novozymes",
                            "news_provider":
                            "novozymes"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #18
    def crawler(self):
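        """
        Scrape RTX (United Technologies) press-release cards page by page,
        fetch each article's description, and bulk-insert items that are
        not already stored.
        """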
        try:
            page = 1
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            while True:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               "Get",
                                               headers=self.headers)
                if 'we did not find any results related' in response.text:
                    break
                soup = BeautifulSoup(response.content, "html.parser")
                boxs = soup.find_all("li", {"class": 'utc-cards--item'})
                for box in boxs:
                    date = box.find("time", {"class": "utc-card--date"}).text
                    if date:
                        date = Helper.parse_date(date)
                        if date.year < datetime.datetime.now().year:
                            break
                    datadict = Helper.get_news_dict()
                    datadict.update(
                        {"url": "https://www.rtx.com" + box.find("a")['href']})
                    description = self.fetchDescription("https://www.rtx.com" +
                                                        box.find("a")['href'])

                    url = "https://www.rtx.com" + box.find("a")['href']

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    datadict.update({
                        "date":
                        date,
                        "news_provider":
                        "UNITED TECHNOLOGIES CORPORATION",
                        "formatted_sub_header":
                        box.find("a").text,
                        "publishedAt":
                        date,
                        "description":
                        description,
                        "title":
                        box.find("a").text,
                        "link":
                        "https://www.rtx.com" + box.find("a")['href'],
                        "text":
                        description,
                        "company_id":
                        "rtx",
                        "news_url_uid":
                        hashlib.md5(
                            ("https://www.rtx.com" +
                             box.find("a")['href']).encode()).hexdigest()
                    })
                    bulk_obj.insert(datadict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                # Advance to the next listing page before the next request
                page += 1

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #19
    def crawler_news(self):
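        """
        Scrape the Fuji Electric news-release tab from a single page
        and bulk-insert items that are not already stored.
        """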
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find('div', {'id': 'tab_news_release'})
            if news_data:
                for news in news_data.find_all('dt'):
                    news_dict = Helper.get_news_dict()

                    title_data = news.find_next_sibling().find_next_sibling().a
                    title = title_data.text if title_data else ""

                    url_data = news.find_next_sibling().find_next_sibling().a
                    url = url_data['href'] if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.text if news.text != '' else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "fujielectric",
                        "ticker":
                        "fujielectric_scrapped",
                        "industry_name":
                        "fujielectric",
                        "news_provider":
                        "fujielectric"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #20
    def crawler_news(self):
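        """
        Scrape H.B. Fuller news listings page by page, fetch each article
        page for its description, and bulk-insert items that are not
        already stored.
        """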
        try:
            loop = True
            page = 1
            while loop:
                try:
                    response = crawler.MakeRequest(self.url.format(page=page),
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, self.news_collection)
                    news_data = soup.find_all('div', {'class': "media"})
                    if news_data:
                        for news in news_data:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find('h4',
                                                   {'class': 'media-heading'})
                            title = title_data.text if title_data else ""

                            url_data = news.find('a', {'href': True})
                            url = "https://www.hbfuller.com" + str(
                                url_data['href']) if url_data else ''

                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            publish_date_data = news.find(
                                'div', {'class': 'listing-date'})
                            publish_date = Helper.parse_date(
                                str(publish_date_data.text).strip()
                            ) if publish_date_data and publish_date_data.text != '' else ''

                            url_response = crawler.MakeRequest(
                                url,
                                'Get',
                                postData=self.body,
                                headers=self.headers)
                            url_soup = BeautifulSoup(url_response.content,
                                                     'html.parser')
                            description_data = url_soup.find(
                                'div', {
                                    'class': 'row ar-body'
                                }).find('div', {
                                    'class': "col-xs-12 col-sm-8 col-md-9"
                                }).find('div', {
                                    'class': 'col-sm-12'
                                }).find('div', {'style': ''})
                            description = description_data.text.strip().split(
                                '\n')
                            description = ''.join(description[1:])

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description,
                                "text":
                                description,
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "hbfuller",
                                "ticker":
                                "hbfuller_scrapped",
                                "industry_name":
                                "hbfuller",
                                "news_provider":
                                "hbfuller"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 0:
                            bulk_obj.execute()

                        page += 1
                    else:
                        print("All news has been scrapped !!")
                        loop = False
                except AttributeError as e:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #21
    def crawler(self):
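        """
        Scrape the ADLINK news listing, fetch each article's date and
        description, and bulk-insert items that are not already stored.
        """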
        try:
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            response = crawler.MakeRequest(self.url, "Get")
            soup = BeautifulSoup(response.content, "html.parser")
            boxs = soup.find_all("div",
                                 {"class": 'listCol sort-item news-item'})
            for box in boxs:
                datadict = Helper.get_news_dict()
                datadict.update({
                    "url":
                    "https://www.adlinktech.com" + box.find("a")['href']
                })
                url = "https://www.adlinktech.com" + box.find("a")['href']
                # Check if already present
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(
                    self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                    QueryType.one)
                if (chkIsExists):
                    print("Already saved. url - ( " + url + " )")
                    continue
                date, description = self.fetchDescription(
                    "https://www.adlinktech.com" + box.find("a")['href'])
                datadict.update({
                    "date":
                    Helper.parse_date(date),
                    "news_provider":
                    "adlink",
                    "formatted_sub_header":
                    box.find("div", {
                        "class": "contentText"
                    }).text,
                    "publishedAt":
                    Helper.parse_date(date),
                    "description":
                    description,
                    "title":
                    box.find("div", {
                        "class": "contentText"
                    }).text,
                    "link":
                    "https://www.adlinktech.com" + box.find("a")['href'],
                    "ticker":
                    "adlink_scrapped",
                    "industry_name":
                    "adlink",
                    "news_title_uid":
                    hashlib.md5(
                        box.find("div", {
                            "class": "contentText"
                        }).text.encode()).hexdigest(),
                    "text":
                    description,
                    "company_id":
                    "adlink",
                    "news_url_uid":
                    hashlib.md5(("https://www.adlinktech.com" +
                                 box.find("a")['href']).encode()).hexdigest()
                })
                bulk_obj.insert(datadict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, self.news_collection)

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #22
    def crawler_news(self):
        """
                This function will scrap news page wise for given url
                :return:
        """
        try:
            loop = True
            page = 0
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find_all(
                    'div', {
                        'class':
                        "coh-column zmb-44 coh-visible-xs coh-col-xs-12 coh-visible-md coh-col-md-6 coh-col-md-push-0 coh-col-md-pull-0 coh-col-md-offset-0"
                    })
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('a')
                        title = title_data.text if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = "https://www.zscaler.com/" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find(
                            'time', {'class': 'text-center bg-sea-green'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find_all(
                            'div', {'class': "col-sm-12"})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data:
                            if desc.find(
                                    'a', {'href': 'https://www.zscaler.com/'}
                            ) is not None:
                                description.append(
                                    regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "zscaler",
                            "ticker":
                            "zscaler_scrapped",
                            "industry_name":
                            "zscaler",
                            "news_provider":
                            "zscaler"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #23
    def crawler_news(self):
        """
        This function will scrap news page wise for given url
        :return:
        """

        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find_all('li', {'class': "NewsPanel__item"})
            if news_data:
                for news in news_data:
                    try:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('div',
                                               {'class': 'NewsPanel__body'})
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = "https://www.kaneka.co.jp/" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find(
                            'time', {'class': 'NewsPanel__time'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'article', {'class': 'articleBody topics__mod'})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "kaneka",
                            "ticker":
                            "kaneka_scrapped",
                            "industry_name":
                            "kaneka",
                            "news_provider":
                            "kaneka"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    except Exception as e:
                        self.logger.error(f"Error Occured : \n", exc_info=True)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #24
    def crawler_news(self):
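        """
        Scrape the Alert Logic press-release listing, fetch each article
        page for its description, and bulk-insert items that are not
        already stored.
        """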
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find_all(
                'div', class_='small-12 columns event-post-info')

            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h6')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('span', class_='date caps')
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')

                    description_data = url_soup.find('div', {'class': ""})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for string in description_data.stripped_strings:
                        description.append(regex.sub("", str(string.strip())))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "alertlogic",
                        "topic_name":
                        "press-releases",
                        "ticker":
                        "alertlogic_scrapped",
                        "industry_name":
                        "alertlogic",
                        "news_provider":
                        "alertlogic"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #25
    def crawler_news(self):
        """
                This function will scrap news page wise for given url
                :return:
        """
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_list = soup.find('div', {'class': 'NewsList'})
                if news_list:
                    news_data = news_list.find_all('li')
                    if news_data:
                        for news in news_data:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find('div', {'class': 'title'})
                            title = title_data.text if title_data else ""

                            url_data = news.find('a', {'href': True})
                            url = "https://www.xinyiglass.com/" + str(
                                url_data['href']) if url_data else ''

                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            regex = re.compile(r'[\n\r\t]')
                            description_data = news.find(
                                'div', {'class': 'info'})
                            description = regex.sub(
                                "", description_data.text
                            ) if description_data else ''

                            date = news.find('span')
                            year_month = news.find('em')
                            publish_date = Helper.parse_date(
                                (year_month.text) + "-" +
                                str(date.text)) if date and year_month else ''

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description.strip(),
                                "text":
                                description.strip(),
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "xinyiglass",
                                "ticker":
                                "xinyiglass_scrapped",
                                "industry_name":
                                "xinyiglass",
                                "news_provider":
                                "xinyiglass"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 0:
                            bulk_obj.execute()

                        page += 1
                    else:
                        print("All news has been scrapped !!")
                        loop = False
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #26
    def crawler_news(self):
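        """
        Scrape the EnerSys content listing, fetch each article page for
        its description, and bulk-insert items that are not already stored.
        """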
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find(
                'div', {'class': "content-listing__items glide__slides"})
            if news_data:
                for news in news_data.find_all(
                        'a',
                    {'class': 'content-listing__item glide__slide col-lg-3'}):
                    news_dict = Helper.get_news_dict()
                    regex = re.compile(r'[\r\n\xa0]')

                    title_data = news.find('h3')
                    title = regex.sub("", str(
                        title_data.text.strip())) if title_data else ""

                    url = "https://www.enersys.com" + str(news['href'])

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find(
                        'p', {'class': 'content-listing__item-date'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text.strip()
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find(
                        'div', {'class': "standard-page__body"})

                    description = []

                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "enersys",
                        "ticker":
                        "enersys_scrapped",
                        "industry_name":
                        "enersys",
                        "news_provider":
                        "enersys"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

            else:
                print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #27
    def crawler(self):
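        """
        Scrape the Panasonic global news list page by page and insert the
        collected items into the news collection in a single batch.
        """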
        try:
            data = []
            counter = 1
            while True:
                response = crawler.MakeRequest(
                    self.url.format(counter=counter), "Get")
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "html.parser")

                    boxs = soup.find_all("div",
                                         {"class": 'unicom-newsListItem'})
                    for box in boxs:
                        date = box.find("p", {
                            "class": "unicom-listInformationDate"
                        }).text
                        if date:
                            date = Helper.parse_date(date)
                            if date.year < datetime.datetime.now().year:
                                break
                        datadict = Helper.get_news_dict()
                        url = box.find("a")['href']
                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue
                        datadict.update({"newsurl": box.find("a")['href']})
                        description = self.fetchDescription(
                            box.find("a")['href'])
                        datadict.update({
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "date":
                            box.find("p", {
                                "class": "unicom-listInformationDate"
                            }).text,
                            "news_provider":
                            "panasonic",
                            "formatted_sub_header":
                            box.find("h3", {
                                "class": "unicom-newsListTitleIn"
                            }).text,
                            "publishedAt":
                            date,
                            "description":
                            description,
                            "title":
                            box.find("h3", {
                                "class": "unicom-newsListTitleIn"
                            }).text
                        })

                        data.append(datadict)
                    # Advance to the next listing page
                    counter += 1
                    self.url = "https://news.panasonic.com/global/all/all_{counter}.html"
                else:
                    break
            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #28
    def crawler_news(self):
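        """
        Scrape Infineon press releases from the offset-paginated JSON
        endpoint, fetch each article's description, and bulk-insert items
        that are not already stored.
        """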
        try:
            loop = True
            offset = 0
            while loop:
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                response = crawler.MakeRequest(
                    self.url,
                    'Post',
                    postData=self.body.format(off_set=offset),
                    headers=self.headers)
                if response is not None:
                    news_data = json.loads(response.content.decode('utf-8'))

                    if 'count' in news_data and news_data['count'] > 0:
                        for news in news_data['pages']['items']:
                            print(news)
                            date = Helper.parse_date(news['news_date'])
                            if date:
                                if date.year < datetime.datetime.now().year:
                                    break

                            url = "https://www.infineon.com/" + news['url']
                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            news_dict = Helper.get_news_dict()
                            description = self.fetchDescription(
                                "https://www.infineon.com/" + news['url'])
                            news_dict.update({
                                "date":
                                Helper.parse_date(news['news_date']),
                                "news_provider":
                                "Infineon",
                                "url":
                                "https://www.infineon.com/" + news['url'],
                                "formatted_sub_header":
                                "",
                                "publishedAt":
                                Helper.parse_date(news['news_date']),
                                "description":
                                description,
                                "title":
                                news['title'],
                                "ticker":
                                "Infineon_scrapped",
                                "industry_name":
                                "Infineon",
                                "news_title_uid":
                                hashlib.md5(
                                    news['title'].encode()).hexdigest(),
                                "link":
                                "https://www.infineon.com/" + news['url'],
                                "text":
                                description,
                                "company_id":
                                "Infineon",
                                "news_url_uid":
                                hashlib.md5(
                                    ("https://www.infineon.com/" +
                                     news['url']).encode()).hexdigest()
                            })

                            bulk_obj.insert(news_dict)
                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 1:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)
                    else:
                        print("No data found")
                        loop = False
                    offset += 10
                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                else:
                    break
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #29
    def crawler_news(self):
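        """
        Scrape the first five pages of the Accuray investor news listing,
        fetch each article page for its description, and bulk-insert items
        that are not already stored.
        """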
        try:
            for page in range(0, 5):
                url = baseurl + str(page)
                self.url = url
                response = crawler.MakeRequest(self.url,
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_body = soup.find('tbody')
                news_data = news_body.find_all('tr')

                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        news_header = news.find(
                            'td',
                            class_=
                            'views-field views-field-field-nir-news-title')

                        title_data = news_header.find('a')
                        title = title_data.text if title_data else ""

                        url_data = news_header.find('a', {'href': True})
                        url = "https://investors.accuray.com" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        date_header = news.find(
                            'td',
                            class_='views-field views-field-field-nir-news-date'
                        )

                        publish_date_data = date_header.find('time',
                                                             class_='datetime')
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')

                        description_data = url_soup.find(
                            'div', {'class': "xn-content"})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(
                                regex.sub("", str(desc.text.strip())))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "accuracy",
                            "ticker":
                            "accuracy_scrapped",
                            "industry_name":
                            "accuracy",
                            "news_provider":
                            "accuracy"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()
                else:
                    print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)