Example #1
    def crawler_news(self):
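        """
        Scrape the Shin-Etsu news listing returned as JSON-wrapped HTML,
        fetch each non-PDF article page for its description and bulk-insert
        the records into the 'shinetsu_news' collection.
        :return:
        """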
        response = crawler.MakeRequest(self.url,
                                       'Post',
                                       postData=self.body,
                                       headers=self.headers)
        news_json = json.loads(response.content.decode('utf-8'))
        if news_json:
            soup = BeautifulSoup(news_json['data'], 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, 'shinetsu_news')
            news_data = soup.find_all('div', {'class': "item"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('div', {'class': 'title'})
                    title = title_data.text.strip().split(
                        '\n')[0] if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    publish_date_data = news.find('p', {'class': 'date'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    if url.split('.')[-1] != 'pdf':
                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'div', {'class': "content-news"})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)
                    else:
                        description = ''

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "shinetsu",
                        "ticker":
                        "shinetsu_scrapped",
                        "industry_name":
                        "shinetsu",
                        "news_provider":
                        "shinetsu"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'shinetsu_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

            else:
                print("News Not Found")
Example #2
    def crawler_news(self):
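        """
        Scrape the Canon global news release listing, skip articles already
        stored (matched on news_url_uid), fetch each article page for its
        description and bulk-insert new records into self.news_collection.
        :return:
        """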
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find('div', {'id': "newsRelease"})
            if news_data:
                for news in news_data.find_all(
                        'div', {'class': 'grid-sizer grid-item news--item'}):
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h4')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = "https://global.canon/" + str(
                        url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('div',
                                                  {'class': 'news-date'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find('div', {
                        'id': 'news-detail'
                    }).find_all('div', {'class': "col-row"})[1]

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "canon",
                        "ticker":
                        "canon_scrapped",
                        "industry_name":
                        "canon",
                        "news_provider":
                        "canon"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scrapped !!")
                loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #3
    def crawler_news(self):
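        """
        Scrape the Arrow press release list from the JSON endpoint, fetch each
        detail page for its description and bulk-insert the records into the
        'arrow_news' collection.
        :return:
        """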
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'arrow_news')
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news in news_data['GetPressReleaseListResult']:
                news_dict = Helper.get_news_dict()

                title = news['Headline'] if 'Headline' in news else ""

                url = "https://news.fiveyearsout.com" + str(
                    news['LinkToDetailPage']
                ) if 'LinkToDetailPage' in news else ''

                publish_date = Helper.parse_date(
                    news['PressReleaseDate']
                ) if 'PressReleaseDate' in news else ''

                url_response = crawler.MakeRequest(url,
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div',
                                                 {'class': "module_body"})

                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)

                news_dict.update({
                    "title":
                    title,
                    "news_title_uid":
                    hashlib.md5(title.encode()).hexdigest(),
                    "url":
                    url,
                    "link":
                    url,
                    "news_url_uid":
                    hashlib.md5(url.encode()).hexdigest(),
                    "description":
                    description,
                    "text":
                    description,
                    "publishedAt":
                    publish_date,
                    'date':
                    publish_date,
                    "publishedAt_scrapped":
                    publish_date,
                    "company_id":
                    "arrow",
                    "ticker":
                    "arrow_scrapped",
                    "industry_name":
                    "arrow",
                    "news_provider":
                    "arrow"
                })

                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, 'arrow_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("News Not Found")
Example #4
    def crawler_news(self):
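        """
        Scrape the Tereos news feed page by page from the WordPress REST API,
        skip articles already stored (matched on news_url_uid) and bulk-insert
        new records into self.news_collection.
        :return:
        """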
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title = news['title'][
                            'rendered'] if 'title' in news and 'rendered' in news[
                                'title'] else ''

                        url = news['link'] if 'link' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news[
                            'date_gmt'] if 'date_gmt' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        description_data = BeautifulSoup(
                            news['acf']['sections'][0]['text'], 'html.parser')

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "tereos",
                            "ticker":
                            "tereos_scrapped",
                            "industry_name":
                            "tereos",
                            "news_provider":
                            "tereos"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #5
    def crawler_news(self):
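        """
        Scrape the TitanX news listing page by page, skip articles already
        stored (matched on news_url_uid), fetch each article page for its
        description and bulk-insert new records into self.news_collection.
        :return:
        """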
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                if (response.status_code == 200):
                    soup = BeautifulSoup(response.content, 'html.parser')
                else:
                    break
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('div', {
                    'class': "x-main full"
                }).find_all('div', {'class': 'x-container max width'})
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h2', {'class': 'entry-title'})
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = url_data['href'] if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find('time',
                                                      {'class': 'entry-date'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'div', {'class': "entry-content content"})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        if description_data.h2 != None:
                            for desc in description_data.h2.find_all_previous(
                                    "p")[::-1]:
                                description.append(
                                    regex.sub("", str(desc.text)))
                        else:
                            for desc in description_data.find_all('p'):
                                description.append(
                                    regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "titanx",
                            "ticker":
                            "titanx_scrapped",
                            "industry_name":
                            "titanx",
                            "news_provider":
                            "titanx"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #6
    def crawler_news(self):
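        """
        Scrape the Asahi Kasei news release JSON feed, fetch each article page
        for its description and bulk-insert the records into the
        'asahi_kasei_news' collection.
        :return:
        """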
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        bulk_obj = DbOperations.Get_object_for_bulkop(False,
                                                      'asahi_kasei_news')
        news_data = json.loads(response.content.decode('utf-8'))
        if news_data:
            for news_list in news_data[0]['2020'][1]['release'][0]['mooth']:
                for news in news_list['item']:
                    news_dict = Helper.get_news_dict()

                    title = news['text'] if 'text' in news else ''

                    url = "https://www.asahi-kasei.com" + str(
                        news['url']) if 'url' in news else ''

                    publish_date_data = news['day'] if 'day' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find('main', {'class': "main"})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "asahi_kasei",
                        "ticker":
                        "asahi_kasei_scrapped",
                        "industry_name":
                        "asahi_kasei",
                        "news_provider":
                        "asahi_kasei"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'asahi_kasei_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
        else:
            print("News Not Found")
Example #7
    def crawler_news(self):
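        """
        Scrape the aruplab.com news listing page by page, fetch each article
        page for its description and bulk-insert the records into the
        'aruplab_news' collection.
        :return:
        """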
        loop = True
        page = 0
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page),
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, 'aruplab_news')
            news_data = soup.find_all('div', {'class': "views-col"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h4')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = "https://www.aruplab.com" + str(
                        url_data['href']) if url_data else ''

                    publish_date_data = news.find(
                        'span',
                        {'class': 'views-field views-field-field-date'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find('main', {
                        'role': "main"
                    }).find(
                        'div', {
                            'class':
                            'field field--name-body field--type-text-with-summary field--label-hidden field__item'
                        })

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "aruplab",
                        "ticker":
                        "aruplab_scrapped",
                        "industry_name":
                        "aruplab",
                        "news_provider":
                        "aruplab"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'aruplab_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

                page += 1
            else:
                print("News Not Found")
                loop = False
Example #8
    def crawler_news(self):
        """
        This function will scrape news page-wise for the given url
        :return:
        """

        try:
            loop = True
            page = 0
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, 'harvardbioscience_news')
                news_data = soup.find('tbody')
                if news_data:
                    for news in news_data.find_all('tr'):
                        try:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find(
                                'td', {
                                    'class':
                                    'views-field views-field-field-nir-news-title'
                                }).find('a', {'href': True})
                            title = title_data.text.strip(
                            ) if title_data else ""

                            url_data = news.find(
                                'td', {
                                    'class':
                                    'views-field views-field-field-nir-news-title'
                                }).find('a', {'href': True})
                            url = "https://investor.harvardbioscience.com" + str(
                                url_data['href']) if url_data else ''

                            publish_date_data = news.find(
                                'time', {'class': 'datetime'})
                            publish_date = Helper.parse_date(
                                publish_date_data.text
                            ) if publish_date_data else ''

                            url_response = crawler.MakeRequest(
                                url,
                                'Get',
                                postData=self.body,
                                headers=self.headers)
                            url_soup = BeautifulSoup(url_response.content,
                                                     'html.parser')
                            description_data = url_soup.find(
                                'div', {'class': 'node__content'})

                            description = []
                            regex = re.compile(r'[\n\xa0]')
                            for desc in description_data.find_all('p'):
                                description.append(
                                    regex.sub("", str(desc.text)))
                            description = ''.join(description)

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description,
                                "text":
                                description,
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "harvardbioscience",
                                "ticker":
                                "harvardbioscience_scrapped",
                                "industry_name":
                                "harvardbioscience",
                                "news_provider":
                                "harvardbioscience"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, 'harvardbioscience_news')

                        except Exception as e:
                            self.logger.error(f"Error Occured : \n",
                                              exc_info=True)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("News Not Found")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #9
    def crawler_news(self):
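        """
        Scrape the Alert Logic press release listing, skip articles already
        stored (matched on news_url_uid), fetch each article page for its
        description and bulk-insert new records into self.news_collection.
        :return:
        """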
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find_all(
                'div', class_='small-12 columns event-post-info')

            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h6')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = str(url_data['href']) if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('span', class_='date caps')
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')

                    description_data = url_soup.find('div', {'class': ""})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for string in description_data.stripped_strings:
                        description.append(regex.sub("", str(string.strip())))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "alertlogic",
                        "topic_name":
                        "press-releases",
                        "ticker":
                        "alertlogic_scrapped",
                        "industry_name":
                        "alertlogic",
                        "news_provider":
                        "alertlogic"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scrapped !!")
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #10
    def crawler_news(self):
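        """
        Scrape the voestalpine article listing, fetch each linked article page
        for its publish date and bulk-insert the records into the
        'voestalpine_news' collection.
        :return:
        """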
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False,
                                                      'voestalpine_news')
        article_data = soup.find_all('div', {'class': 'article'})
        if article_data:
            for article in article_data:
                news_data = article.find_all('section', {'class': ""})
                for news in news_data[1::2]:

                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h2')
                    title = title_data.text if title_data else ""

                    url = news.find_next_sibling(
                    ).a['href'] if news.find_next_sibling() else ''

                    description_data = news.find('p')
                    description = description_data.text if description_data else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    publish_date_data = url_soup.find(
                        'p', {'class': 'meta large inline'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text.replace('|', "").strip()
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "voestalpine",
                        "ticker":
                        "voestalpine_scrapped",
                        "industry_name":
                        "voestalpine",
                        "news_provider":
                        "voestalpine"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'voestalpine_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
Example #11
    def crawler_news(self):
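        """
        Scrape the PolyOne news results page by page, skip articles already
        stored (matched on news_url_uid), fetch each article page for its
        description and bulk-insert new records into self.news_collection.
        :return:
        """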
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('div', {'class': 'block-region-results'})
                if news_data:
                    for news in news_data.find_all('tr'):
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h3')
                        title = title_data.text if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = "https://www.polyone.com/" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find('h5',
                                                      {'class': 'float-left'})
                        publish_date = Helper.parse_date(
                            str(publish_date_data.text).split('\n')[0]
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'div', {'class': "block-region-top"})
                        description_data = description_data.strong.find_all_previous(
                            'p')[1:-3]
                        description_data.reverse()

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data:
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "polyone",
                            "ticker":
                            "polyone_scrapped",
                            "industry_name":
                            "polyone",
                            "news_provider":
                            "polyone"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #12
    def crawler_news(self):
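        """
        Scrape the Audi news listing page by page from the JSON endpoint that
        wraps rendered HTML, skip articles already stored (matched on
        news_url_uid) and bulk-insert new records into self.news_collection.
        :return:
        """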
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                if response.headers[
                        'Content-Type'] == 'application/json; charset=utf-8':
                    response_json = json.loads(
                        response.content.decode('utf-8'))
                else:
                    print("No data found")
                    break
                soup = BeautifulSoup(response_json['html'], 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find_all(
                    'li', {
                        'class':
                        'page-list--item is-detailed infinite-nodes--list-item'
                    })
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h3')
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = url_data['href'] if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find('div',
                                                      {'class': 'meta--item'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        description_data = news.find(
                            'div', {'class': "page-list--text"})
                        description = description_data.text.strip(
                        ) if description_data else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "audi",
                            "ticker":
                            "audi_scrapped",
                            "industry_name":
                            "audi",
                            "news_provider":
                            "audi"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #13
    def crawler_news(self):
        """
        This function will scrape news page-wise for the given url
        :return:
        """
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_list = soup.find('div', {'class': 'NewsList'})
                if news_list:
                    news_data = news_list.find_all('li')
                    if news_data:
                        for news in news_data:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find('div', {'class': 'title'})
                            title = title_data.text if title_data else ""

                            url_data = news.find('a', {'href': True})
                            url = "https://www.xinyiglass.com/" + str(
                                url_data['href']) if url_data else ''

                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            regex = re.compile(r'[\n\r\t]')
                            description_data = news.find(
                                'div', {'class': 'info'})
                            description = regex.sub(
                                "", description_data.text
                            ) if description_data else ''

                            date = news.find('span')
                            year_month = news.find('em')
                            publish_date = Helper.parse_date(
                                (year_month.text) + "-" +
                                str(date.text)) if date and year_month else ''

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description.strip(),
                                "text":
                                description.strip(),
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "xinyiglass",
                                "ticker":
                                "xinyiglass_scrapped",
                                "industry_name":
                                "xinyiglass",
                                "news_provider":
                                "xinyiglass"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 0:
                            bulk_obj.execute()

                        page += 1
                    else:
                        print("All news has been scrapped !!")
                        loop = False
                else:
                    print("All news has been scrapped !!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error Occured : \n", exc_info=True)
Example #14
    def crawler_news(self):
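        """
        Scrape the Novozymes news JSON feed page by page via POST requests and
        bulk-insert the records into the 'novozymes_news' collection.
        :return:
        """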
        loop = True
        page = 0
        while loop:
            response = crawler.MakeRequest(
                self.url,
                'Post',
                postData=self.body.format(page=page),
                headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, 'novozymes_news')
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data['News']:
                for news in news_data['News']:
                    news_dict = Helper.get_news_dict()

                    title = news['Title'] if 'Title' in news else ''

                    url = "https://www.novozymes.com" + str(
                        news['Url']) if 'Url' in news else ''

                    publish_date_data = news[
                        'CreationDate'] if 'CreationDate' in news else ''
                    publish_date = Helper.parse_date(publish_date_data)

                    description = news['Content'] if 'Content' in news else ''

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "novozymes",
                        "ticker":
                        "novozymes_scrapped",
                        "industry_name":
                        "novozymes",
                        "news_provider":
                        "novozymes"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'novozymes_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

                page += 1
            else:
                print("News Not Found")
                loop = False
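A small aside on the repeated `value if key in dict else ''` lookups in the JSON-based examples: `dict.get` expresses the same fallback in one call. The sketch below rewrites the lookups of this example only (it relies on the same `news` item and `Helper` from the surrounding code); the URL line keeps the conditional form, because prefixing an empty string with `.get` would otherwise yield a bare domain.

# Equivalent field lookups using dict.get (sketch, same fallbacks as above).
title = news.get('Title', '')
description = news.get('Content', '')
publish_date = Helper.parse_date(news.get('CreationDate', ''))
# Kept conditional: with .get alone, a missing 'Url' would still produce the prefix.
url = "https://www.novozymes.com" + str(news['Url']) if 'Url' in news else ''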
Example #15
    def crawler_news(self):
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page),
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, 'hanwhacorp_news')
            news_data = soup.find('tbody')
            if news_data and news_data.tr.text.strip() != 'There is no data.':
                for news in news_data.find_all('tr'):
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('td', {'class': 'title'})
                    title = title_data.text if title_data else ""

                    publish_date_data = news.find_all('td')[3].text
                    publish_date = Helper.parse_date(
                        publish_date_data
                    ) if publish_date_data and publish_date_data != '' else ''

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "hanwhacorp",
                        "ticker":
                        "hanwhacorp_scrapped",
                        "industry_name":
                        "hanwhacorp",
                        "news_provider":
                        "hanwhacorp"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'hanwhacorp_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

                page += 1
            else:
                print("News Not Found")
                loop = False
Example #16
    def crawler_news(self):
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page),
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, 'zscaler_news')
            news_data = soup.find_all(
                'div', {'class': "col-12 col-md-6 zpb-32 zpb-md-48"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('a')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    publish_date_data = news.find(
                        'time', {'class': 'text-center bg-sea-green'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find_all(
                        'div', {'class': "col-sm-12"})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data:
                        if desc.find('a', {'href': 'https://www.zscaler.com/'}) is not None:
                            description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "zscaler",
                        "ticker":
                        "zscaler_scrapped",
                        "industry_name":
                        "zscaler",
                        "news_provider":
                        "zscaler"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'zscaler_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

                page += 1
            else:
                print("News Not Found")
                loop = False
Example #17
    def crawler_news(self):
        """
        This function will scrape news page-wise for the given URL.
        :return:
        """

        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find_all('li', {'class': "NewsPanel__item"})
            if news_data:
                for news in news_data:
                    try:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('div',
                                               {'class': 'NewsPanel__body'})
                        title = title_data.text.strip() if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = "https://www.kaneka.co.jp/" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news.find(
                            'time', {'class': 'NewsPanel__time'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        description_data = url_soup.find(
                            'article', {'class': 'articleBody topics__mod'})

                        description = []
                        regex = re.compile(r'[\n\xa0]')
                        for desc in description_data.find_all('p'):
                            description.append(regex.sub("", str(desc.text)))
                        description = ''.join(description)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "kaneka",
                            "ticker":
                            "kaneka_scrapped",
                            "industry_name":
                            "kaneka",
                            "news_provider":
                            "kaneka"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    except Exception as e:
                        self.logger.error(f"Error occurred: {e}", exc_info=True)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scraped!!")
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
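The duplicate check in this example (and several later ones) hashes the article URL and looks the digest up under `news_url_uid`. A minimal standalone sketch of that check, written against a pymongo collection handle rather than the project's `DbOperations` wrapper, might look like this; the connection string and collection names are assumptions.

import hashlib


def already_scraped(collection, url):
    """Return True if an article with this URL's md5 uid is already stored."""
    uid = hashlib.md5(url.encode()).hexdigest()
    return collection.find_one({"news_url_uid": uid}) is not None


# Illustrative usage:
# from pymongo import MongoClient
# collection = MongoClient("mongodb://localhost:27017")["news"]["kaneka_news"]
# if already_scraped(collection, url):
#     print("Already saved. url - ( " + url + " )")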
Example #18
    def crawler_news(self):
        loop = True
        page = 1
        while loop:
            response = crawler.MakeRequest(self.url.format(page=page),
                                           'Get',
                                           postData=self.body)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(False, 'jnj_news')
            news_data = soup.find_all('div', {'class': "MediaPromo-title"})
            if news_data:
                for news in news_data:
                    news_dict = Helper.get_news_dict()

                    title_data = news.find('div',
                                           {'class': 'ResponsiveText-text'})
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    url_response = crawler.MakeRequest(url, 'Get')
                    url_soup_obj = BeautifulSoup(url_response.content,
                                                 'html.parser')
                    url_response_data = url_soup_obj.find(
                        'script', {'type': 'application/ld+json'})
                    url_response_data = json.loads(url_response_data.text)

                    if url_response_data:
                        publish_date = Helper.parse_date(
                            url_response_data['datePublished']
                        ) if 'datePublished' in url_response_data else ''
                        news_provider = url_response_data['publisher'][
                            'name'] if 'publisher' in url_response_data and 'name' in url_response_data[
                                'publisher'] else ''
                        industry_name = news_provider
                        news_dict.update({
                            "news_provider": news_provider,
                            "industry_name": industry_name,
                            "publishedAt": publish_date,
                            'date': publish_date
                        })

                    description_data = url_soup_obj.find(
                        'div', {'class': 'FullBleedLede-dek'})
                    description = description_data.text if description_data else ''

                    news_dict.update({
                        "title": title,
                        "url": url,
                        "formatted_sub_header": title,
                        "description": description,
                        "link": url
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, 'jnj_news')

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()

                page += 1

            else:
                print("News Not Found")
                loop = False
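This example reads `datePublished` and `publisher.name` from the article page's `application/ld+json` script, but `json.loads(url_response_data.text)` will raise if the script tag is missing or malformed. Below is a hedged, more defensive sketch of that one step; it reuses `url_soup_obj`, `json`, and `Helper` from the example above and adds nothing beyond the standard library and BeautifulSoup.

# Defensive JSON-LD extraction (sketch of the step above).
script_tag = url_soup_obj.find('script', {'type': 'application/ld+json'})
meta = {}
if script_tag and script_tag.string:
    try:
        meta = json.loads(script_tag.string)
    except ValueError:
        meta = {}  # malformed JSON-LD: fall back to empty metadata

publish_date = Helper.parse_date(meta['datePublished']) if 'datePublished' in meta else ''
publisher = meta.get('publisher', {})
news_provider = publisher.get('name', '') if isinstance(publisher, dict) else ''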
Example #19
    def crawler_news(self):
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'denso_news')
        news_data_1 = soup.find_all(
            'div', {
                'class':
                "menuBlock01--border menuBlock01--small menuBlock01  menuBlock01--right"
            })
        news_data_2 = soup.find_all(
            'div',
            {'class': "menuBlock01--border menuBlock01--small menuBlock01 "})
        news_data = news_data_1 + news_data_2
        if news_data:
            for news in news_data:
                news_dict = Helper.get_news_dict()

                title_data = news.find('span',
                                       {"class": "menuBlock01__headingText"})
                title = title_data.text.strip() if title_data else ""

                url_data = news.find('a', {'href': True})
                url = "https://www.denso.com" + str(
                    url_data['href']) if url_data else ''

                url_response = crawler.MakeRequest(url,
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                url_response_soup = BeautifulSoup(url_response.content,
                                                  'html.parser')
                description_data = url_response_soup.find(
                    'span', {'class': 'heading01__copy heading01__copy--lead'})
                description = description_data.text.strip(
                ) if description_data else ''

                publish_date_data = news.find('p',
                                              {"class": "menuBlock01__text"})
                if publish_date_data and publish_date_data.span:
                    publish_date_data.span.decompose()
                publish_date = Helper.parse_date(
                    publish_date_data.text
                ) if publish_date_data and publish_date_data.text != '' else ''

                news_dict.update({
                    "title": title,
                    "url": url,
                    "formatted_sub_header": title,
                    "description": description,
                    "link": url,
                    "publishedAt": publish_date,
                    'date': publish_date,
                    "news_provider": "denso"
                })

                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, 'denso_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
Example #20
    def crawler_news(self):
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False,
                                                      'fujielectric_news')
        news_data = soup.find('div', {'id': 'tab_news_release'})
        if news_data:
            for news in news_data.find_all('dt'):
                news_dict = Helper.get_news_dict()

                title_data = news.find_next_sibling().find_next_sibling().a
                title = title_data.text if title_data else ""

                url_data = news.find_next_sibling().find_next_sibling().a
                url = url_data['href'] if url_data else ''

                publish_date_data = news.text if news.text != '' else ''
                publish_date = Helper.parse_date(publish_date_data)

                news_dict.update({
                    "title":
                    title,
                    "news_title_uid":
                    hashlib.md5(title.encode()).hexdigest(),
                    "url":
                    url,
                    "link":
                    url,
                    "news_url_uid":
                    hashlib.md5(url.encode()).hexdigest(),
                    "publishedAt":
                    publish_date,
                    'date':
                    publish_date,
                    "publishedAt_scrapped":
                    publish_date,
                    "company_id":
                    "fujielectric",
                    "ticker":
                    "fujielectric_scrapped",
                    "industry_name":
                    "fujielectric",
                    "news_provider":
                    "fujielectric"
                })

                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, 'fujielectric_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("News Not Found")
Example #21
    def crawler_news(self):
        try:
            loop = True
            while loop:
                response = crawler.MakeRequest(self.url,
                                               'Post',
                                               postData=self.body,
                                               headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data:
                    for news in news_data[:1]:
                        news_dict = Helper.get_news_dict()

                        title = news['Title'] if 'Title' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(title.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. title - ( " + title + " )")
                            continue

                        publish_date_data = news[
                            'EntryDate'] if 'EntryDate' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "omron",
                            "ticker":
                            "omron_scrapped",
                            "industry_name":
                            "omron",
                            "news_provider":
                            "omron"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    self.body['StartRange'] += 1
                else:
                    print("All news has been scraped!!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #22
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = json.loads(response.content.decode('utf-8'))
            if news_data:
                for news in news_data['GetPressReleaseListResult']:
                    news_dict = Helper.get_news_dict()

                    title = news['Headline'] if 'Headline' in news else ""

                    url = "https://www.craneco.com" + str(
                        news['LinkToDetailPage']
                    ) if 'LinkToDetailPage' in news else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date = Helper.parse_date(
                        news['PressReleaseDate']
                    ) if 'PressReleaseDate' in news else ''

                    url_response = crawler.MakeRequest(url,
                                                       'Get',
                                                       postData=self.body,
                                                       headers=self.headers)
                    url_soup = BeautifulSoup(url_response.content,
                                             'html.parser')
                    description_data = url_soup.find(
                        'div', {'class': "module_body clearfix"})

                    description = []
                    regex = re.compile(r'[\n\xa0]')
                    for desc in description_data.find_all('p'):
                        description.append(regex.sub("", str(desc.text)))
                    description = ''.join(description)

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "description":
                        description,
                        "text":
                        description,
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "crane",
                        "ticker":
                        "crane_scrapped",
                        "industry_name":
                        "crane",
                        "news_provider":
                        "crane"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
            else:
                print("All news has been scraped!!")
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #23
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body,
                                               headers=self.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find('tbody')
                if news_data and news_data.tr.text.strip(
                ) != 'There is no data.':
                    for news in news_data.find_all('tr'):
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('td', {'class': 'title'})
                        title = title_data.text if title_data else ""

                        # Check if already present
                        unqUrl = hashlib.md5(title.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_title_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. title - ( " + title + " )")
                            continue

                        publish_date_data = news.find_all('td')[3].text
                        publish_date = Helper.parse_date(
                            publish_date_data
                        ) if publish_date_data and publish_date_data != '' else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "hanwhacorp",
                            "ticker":
                            "hanwhacorp_scrapped",
                            "industry_name":
                            "hanwhacorp",
                            "news_provider":
                            "hanwhacorp"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
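Several of the paginated crawlers above rely solely on an empty result set to break out of `while loop:`; if the site ever changes its "no data" marker, the loop never ends. A hedged sketch of the same pagination shape with an explicit page cap follows; `MAX_PAGES`, `fetch_page`, and `handle_row` are hypothetical placeholders, not part of the original code.

# Pagination loop with an upper bound (sketch; helpers are placeholders).
MAX_PAGES = 500  # assumption: generous cap so a mis-detected empty page cannot spin forever

page = 1
while page <= MAX_PAGES:
    rows = fetch_page(page)   # hypothetical: returns the parsed rows for this page
    if not rows:
        break                 # genuine end of data
    for row in rows:
        handle_row(row)       # hypothetical per-row processing (build news_dict, insert, ...)
    page += 1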
Example #24
    def crawler_news(self):
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'enersys_news')
        news_data = soup.find(
            'div', {'class': "content-listing__items glide__slides"})
        if news_data:
            for news in news_data.find_all(
                    'a',
                {'class': 'content-listing__item glide__slide col-lg-3 '}):
                news_dict = Helper.get_news_dict()
                regex = re.compile(r'[\r\n\xa0]')

                title_data = news.find('h3')
                title = regex.sub("", str(
                    title_data.text.strip())) if title_data else ""

                url = "https://www.enersys.com" + str(news['href'])

                publish_date_data = news.find(
                    'p', {'class': 'content-listing__item-date'})
                publish_date = Helper.parse_date(
                    publish_date_data.text.strip()
                ) if publish_date_data and publish_date_data.text != '' else ''

                url_response = crawler.MakeRequest(url,
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find(
                    'div', {'class': "standard-page__body"})

                description = []

                for desc in description_data.find_all('p'):
                    description.append(regex.sub("", str(desc.text)))
                description = ''.join(description)

                news_dict.update({
                    "title":
                    title,
                    "news_title_uid":
                    hashlib.md5(title.encode()).hexdigest(),
                    "url":
                    url,
                    "link":
                    url,
                    "news_url_uid":
                    hashlib.md5(url.encode()).hexdigest(),
                    "description":
                    description,
                    "text":
                    description,
                    "publishedAt":
                    publish_date,
                    'date':
                    publish_date,
                    "publishedAt_scrapped":
                    publish_date,
                    "company_id":
                    "enersys",
                    "ticker":
                    "enersys_scrapped",
                    "industry_name":
                    "enersys",
                    "news_provider":
                    "enersys"
                })

                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, 'enersys_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()

        else:
            print("News Not Found")
Example #25
    def crawler_news(self):
        response = crawler.MakeRequest(self.url,
                                       'Get',
                                       postData=self.body,
                                       headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        bulk_obj = DbOperations.Get_object_for_bulkop(False, 'olympus_news')
        news_data = soup.find('ul', {'class': "list-news-01"})
        if news_data:
            for news in news_data.find_all('li'):
                news_dict = Helper.get_news_dict()

                title_data = news.find('span', {'class': 'text'})
                title = title_data.text if title_data else ""

                url_data = news.find('a', {'href': True})
                url = "https://www.olympus-global.com" + str(
                    url_data['href']) if url_data else ''

                publish_date_data = news.find('span', {'class': 'date'})
                publish_date = Helper.parse_date(
                    publish_date_data.text
                ) if publish_date_data and publish_date_data.text != '' else ''

                url_response = crawler.MakeRequest(url,
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                url_soup = BeautifulSoup(url_response.content, 'html.parser')
                description_data = url_soup.find('div',
                                                 {'class': "area-content"})

                description = []
                regex = re.compile(r'[\n\xa0]')
                for desc in description_data.find_all('b'):
                    description.append(regex.sub("", str(desc.text.strip())))
                description = ''.join(description)

                news_dict.update({
                    "title":
                    title,
                    "news_title_uid":
                    hashlib.md5(title.encode()).hexdigest(),
                    "url":
                    url,
                    "link":
                    url,
                    "news_url_uid":
                    hashlib.md5(url.encode()).hexdigest(),
                    "description":
                    description,
                    "text":
                    description,
                    "publishedAt":
                    publish_date,
                    'date':
                    publish_date,
                    "publishedAt_scrapped":
                    publish_date,
                    "company_id":
                    "olympus",
                    "ticker":
                    "olympus_scrapped",
                    "industry_name":
                    "olympus",
                    "news_provider":
                    "olympus"
                })

                bulk_obj.insert(news_dict)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 100:
                    bulk_obj.execute()
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, 'olympus_news')

            if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']) > 0:
                bulk_obj.execute()
        else:
            print("News Not Found")
Example #26
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            article_data = soup.find_all('div', {'class': 'article'})
            if article_data:
                for article in article_data:
                    news_data = article.find_all('section', {'class': ""})
                    for news in news_data[1::2]:

                        news_dict = Helper.get_news_dict()

                        title_data = news.find('h2')
                        title = title_data.text if title_data else ""

                        url = news.find_next_sibling(
                        ).a['href'] if news.find_next_sibling() else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        description_data = news.find('p')
                        description = description_data.text if description_data else ''

                        url_response = crawler.MakeRequest(
                            url,
                            'Get',
                            postData=self.body,
                            headers=self.headers)
                        url_soup = BeautifulSoup(url_response.content,
                                                 'html.parser')
                        publish_date_data = url_soup.find(
                            'p', {'class': 'meta large inline'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text.replace('|', "").strip()
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "voestalpine",
                            "ticker":
                            "voestalpine_scrapped",
                            "industry_name":
                            "voestalpine",
                            "news_provider":
                            "voestalpine"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #27
    def crawler_news(self):
        try:
            loop = True
            page = 0
            while loop:
                response = crawler.MakeRequest(
                    self.url,
                    'Post',
                    postData=self.body.format(page=page),
                    headers=self.headers)
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = json.loads(response.content.decode('utf-8'))
                if news_data['News']:
                    for news in news_data['News']:
                        news_dict = Helper.get_news_dict()

                        title = news['Title'] if 'Title' in news else ''

                        url = "https://www.novozymes.com" + str(
                            news['Url']) if 'Url' in news else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        publish_date_data = news[
                            'CreationDate'] if 'CreationDate' in news else ''
                        publish_date = Helper.parse_date(publish_date_data)

                        description = news[
                            'Content'] if 'Content' in news else ''

                        news_dict.update({
                            "title":
                            title,
                            "news_title_uid":
                            hashlib.md5(title.encode()).hexdigest(),
                            "url":
                            url,
                            "link":
                            url,
                            "news_url_uid":
                            hashlib.md5(url.encode()).hexdigest(),
                            "description":
                            description,
                            "text":
                            description,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "publishedAt_scrapped":
                            publish_date,
                            "company_id":
                            "novozymes",
                            "ticker":
                            "novozymes_scrapped",
                            "industry_name":
                            "novozymes",
                            "news_provider":
                            "novozymes"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1
                else:
                    print("All news has been scraped!!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #28
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                response = crawler.MakeRequest(self.url.format(page=page),
                                               'Get',
                                               postData=self.body)
                soup = BeautifulSoup(response.content, 'html.parser')
                bulk_obj = DbOperations.Get_object_for_bulkop(
                    False, self.news_collection)
                news_data = soup.find_all('div',
                                          {'class': "contentCollection--item"})
                if news_data:
                    for news in news_data:
                        news_dict = Helper.get_news_dict()

                        title_data = news.find('a')
                        title = title_data.text if title_data else ""

                        url_data = news.find('a', {'href': True})
                        url = "https://corporate.exxonmobil.com" + str(
                            url_data['href']) if url_data else ''

                        # Check if already present
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if (chkIsExists):
                            print("Already saved. url - ( " + url + " )")
                            continue

                        description_data = news.find(
                            'span',
                            {'class': 'contentCollection--description p'})
                        description = description_data.text if description_data else ''

                        publish_date_data = news.find('span',
                                                      {'class': 'date'})
                        publish_date = Helper.parse_date(
                            publish_date_data.text
                        ) if publish_date_data and publish_date_data.text != '' else ''

                        news_dict.update({
                            "title":
                            title,
                            "url":
                            url,
                            "formatted_sub_header":
                            title,
                            "description":
                            description,
                            "link":
                            url,
                            "publishedAt":
                            publish_date,
                            'date':
                            publish_date,
                            "news_provider":
                            "exxonmobil_corporation",
                            "company_id":
                            "exxonmobil_corporation",
                            "ticker":
                            "exxonmobil_corporation_scrapped",
                            "industry_name":
                            "exxonmobil_corporation"
                        })

                        bulk_obj.insert(news_dict)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 100:
                            bulk_obj.execute()
                            bulk_obj = DbOperations.Get_object_for_bulkop(
                                False, self.news_collection)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 0:
                        bulk_obj.execute()

                    page += 1

                else:
                    print("All news has been scraped!!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #29
    def crawler_news(self):
        try:
            response = crawler.MakeRequest(self.url,
                                           'Get',
                                           postData=self.body,
                                           headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            bulk_obj = DbOperations.Get_object_for_bulkop(
                False, self.news_collection)
            news_data = soup.find('ul', {'class': "newsPressList tabContent"})
            if news_data:
                for news in news_data.find_all('li'):

                    news_dict = Helper.get_news_dict()

                    title_data = news.find('h3')
                    title = title_data.text if title_data else ""

                    url_data = news.find('a', {'href': True})
                    url = url_data['href'] if url_data else ''

                    # Check if already present
                    unqUrl = hashlib.md5(url.encode()).hexdigest()
                    chkIsExists = DbOperations.GetData(
                        self.news_collection, {"news_url_uid": str(unqUrl)},
                        {}, QueryType.one)
                    if (chkIsExists):
                        print("Already saved. url - ( " + url + " )")
                        continue

                    publish_date_data = news.find('span', {'class': 'date'})
                    publish_date = Helper.parse_date(
                        publish_date_data.text
                    ) if publish_date_data and publish_date_data.text != '' else ''

                    news_dict.update({
                        "title":
                        title,
                        "news_title_uid":
                        hashlib.md5(title.encode()).hexdigest(),
                        "url":
                        url,
                        "link":
                        url,
                        "news_url_uid":
                        hashlib.md5(url.encode()).hexdigest(),
                        "publishedAt":
                        publish_date,
                        'date':
                        publish_date,
                        "publishedAt_scrapped":
                        publish_date,
                        "company_id":
                        "LnT",
                        "ticker":
                        "LnT_scrapped",
                        "industry_name":
                        "LnT",
                        "news_provider":
                        "LnT"
                    })

                    bulk_obj.insert(news_dict)

                    if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                           ) > 100:
                        bulk_obj.execute()
                        bulk_obj = DbOperations.Get_object_for_bulkop(
                            False, self.news_collection)

                if len(bulk_obj._BulkOperationBuilder__bulk.__dict__['ops']
                       ) > 0:
                    bulk_obj.execute()
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
Example #30
    def crawler_news(self):
        try:
            loop = True
            page = 1
            while loop:
                try:
                    response = crawler.MakeRequest(self.url.format(page=page),
                                                   'Get',
                                                   postData=self.body,
                                                   headers=self.headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    bulk_obj = DbOperations.Get_object_for_bulkop(
                        False, self.news_collection)
                    news_data = soup.find_all('div', {'class': "media"})
                    if news_data:
                        for news in news_data:
                            news_dict = Helper.get_news_dict()

                            title_data = news.find('h4',
                                                   {'class': 'media-heading'})
                            title = title_data.text if title_data else ""

                            url_data = news.find('a', {'href': True})
                            url = "https://www.hbfuller.com" + str(
                                url_data['href']) if url_data else ''

                            # Check if already present
                            unqUrl = hashlib.md5(url.encode()).hexdigest()
                            chkIsExists = DbOperations.GetData(
                                self.news_collection,
                                {"news_url_uid": str(unqUrl)}, {},
                                QueryType.one)
                            if (chkIsExists):
                                print("Already saved. url - ( " + url + " )")
                                continue

                            publish_date_data = news.find(
                                'div', {'class': 'listing-date'})
                            publish_date = Helper.parse_date(
                                str(publish_date_data.text).strip()
                            ) if publish_date_data and publish_date_data.text != '' else ''

                            url_response = crawler.MakeRequest(
                                url,
                                'Get',
                                postData=self.body,
                                headers=self.headers)
                            url_soup = BeautifulSoup(url_response.content,
                                                     'html.parser')
                            description_data = url_soup.find(
                                'div', {
                                    'class': 'row ar-body'
                                }).find('div', {
                                    'class': "col-xs-12 col-sm-8 col-md-9"
                                }).find('div', {
                                    'class': 'col-sm-12'
                                }).find('div', {'style': ''})
                            description = description_data.text.strip().split(
                                '\n')
                            description = ''.join(description[1:])

                            news_dict.update({
                                "title":
                                title,
                                "news_title_uid":
                                hashlib.md5(title.encode()).hexdigest(),
                                "url":
                                url,
                                "link":
                                url,
                                "news_url_uid":
                                hashlib.md5(url.encode()).hexdigest(),
                                "description":
                                description,
                                "text":
                                description,
                                "publishedAt":
                                publish_date,
                                'date':
                                publish_date,
                                "publishedAt_scrapped":
                                publish_date,
                                "company_id":
                                "hbfuller",
                                "ticker":
                                "hbfuller_scrapped",
                                "industry_name":
                                "hbfuller",
                                "news_provider":
                                "hbfuller"
                            })

                            bulk_obj.insert(news_dict)

                            if len(bulk_obj._BulkOperationBuilder__bulk.
                                   __dict__['ops']) > 100:
                                bulk_obj.execute()
                                bulk_obj = DbOperations.Get_object_for_bulkop(
                                    False, self.news_collection)

                        if len(bulk_obj._BulkOperationBuilder__bulk.
                               __dict__['ops']) > 0:
                            bulk_obj.execute()

                        page += 1
                    else:
                        print("All news has been scraped!!")
                        loop = False
                except AttributeError:
                    print("All news has been scraped!!")
                    loop = False
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)