Пример #1
0
 def post_to_twitter(self):
     """Post ``self.img_path`` with ``self.img_caption`` as a status update.

     OAuth credentials are read from ``settings``. When credential
     verification fails, an error is written to the ``twitter_api`` log and
     an empty string is returned. On the success path the visible code has
     no explicit return statement.
     """
     log = AppLogger("twitter_api")
     handler = tweepy.OAuthHandler(settings.TW_CONSUMER_KEY,
                                   settings.TW_CONSUMER_SECRET)
     handler.set_access_token(settings.TW_ACCESS_TOKEN,
                              settings.TW_ACCESS_TOKEN_SECRET)
     client = tweepy.API(handler)
     try:
         # Fail early if the configured credentials are rejected.
         client.verify_credentials()
     except Exception:
         log.write_error("Error creating API.")
         return ""
     result = client.update_with_media(filename=self.img_path,
                                       status=self.img_caption)
Пример #2
0
class SourceParser:
    """Parser for the Znaj RSS feed.

    The article text is taken from the ``content`` element that each feed
    entry carries.
    """

    src = "https://znaj.ua/feed/rss2.xml"

    def __init__(self):
        """Create the logger used to report parsing problems."""
        self.logger = AppLogger("parser_znaj")

    def get_news(self):
        """Collect posts from the feed.

        Entries without usable ``content`` are logged and skipped; the
        remaining entries are still processed.

        Returns:
            list[dict]: one dict per entry with the keys ``title``,
            ``link``, ``date`` (timezone-aware ``datetime`` in UTC) and
            ``text``.
        """
        feed = feedparser.parse(self.src)
        tz = pytz.timezone("UTC")
        news = []
        for n in feed.entries:
            # ``content`` may be absent, not a list, or empty. Any of these
            # makes ``content[0]`` unusable, so the checks are OR-ed.
            content = getattr(n, "content", None)
            if not isinstance(content, list) or not content:
                logger_message = "Can not find content for URL {}".format(
                    n.link)
                self.logger.write_error(logger_message)
                # Skip only this entry instead of abandoning the whole feed.
                continue
            date = datetime(*n.published_parsed[:6], 0, tz)
            text = ""
            text_div = BeautifulSoup(content[0].value, "html.parser")
            if text_div:
                for script in text_div(["script", "style"]):
                    script.decompose()
                text = text_div.get_text().strip()
                # Drop blank lines left behind by the removed markup.
                text = os.linesep.join([s for s in text.splitlines() if s])
            else:
                logger_message = "Can not find text block for URL {}".format(
                    n.link)
                self.logger.write_error(logger_message)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news
Пример #3
0
class SourceParser:
    """Parser for the Ukrainska Pravda news feed.

    Entries come from the RSS feed in ``src``; the article body is scraped
    from the page each entry links to.
    """

    src = "https://www.pravda.com.ua/rss/view_news/"

    def __init__(self):
        """Create the logger used to report scraping problems."""
        self.logger = AppLogger("parser_uapravda")

    def get_news(self):
        """Collect posts from the feed.

        Entries that have no ``link`` attribute are skipped.

        Returns:
            list[dict]: one dict per entry with the keys ``title``,
            ``link``, ``date`` (timezone-aware ``datetime`` in UTC) and
            ``text``.
        """
        feed = feedparser.parse(self.src)
        tz = pytz.timezone("UTC")
        news = []
        for n in feed.entries:
            if not hasattr(n, 'link'):
                continue
            text = self.get_news_text(n.link)
            date = datetime(*n.published_parsed[:6], 0, tz)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Download the article at *link* and return its text.

        Args:
            link: URL of the article page.

        Returns:
            str: text of the first matching article container, or an empty
            string when the page can not be fetched or no container is
            found. Failures are written to the parser log.
        """
        text = ""
        # Known article containers, tried in order until one matches.
        candidates = (
            ("div", "post_news__text"),
            ("div", "post__text"),
            ("div", "post_text"),
            ("article", "article"),
        )
        try:
            with ur.urlopen(link) as response:
                soup = BeautifulSoup(response.read(),
                                     "html.parser",
                                     from_encoding="cp1251")
                text_div = None
                for tag, css_class in candidates:
                    text_div = soup.find(tag, class_=css_class)
                    if text_div:
                        break
                if text_div:
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text()
                else:
                    message = "Can not find text block for URL {}".format(link)
                    self.logger.write_error(message)
        # HTTPError is a subclass of URLError, so it has to be caught first;
        # otherwise the more specific handler can never run.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        return text
Пример #4
0
class SourceParser:
    """Parser for the 24 Channel RSS feed.

    The text of every post is extracted from the entry's ``description``
    markup.
    """

    src = "https://24tv.ua/rss/all.xml"

    def __init__(self):
        """Attach the logger this parser reports through."""
        self.logger = AppLogger("parser_ua24")

    def get_news(self):
        """Return the feed entries as a list of dicts.

        Each dict has the keys ``title``, ``link``, ``date`` (timezone-aware
        ``datetime`` in UTC) and ``text``. ``text`` is empty, and an error is
        logged, when the description has no ``body`` element.
        """
        feed = feedparser.parse(self.src)
        utc = pytz.timezone("UTC")
        collected = []
        for entry in feed.entries:
            published = datetime(*entry.published_parsed[:6], 0, utc)
            body = BeautifulSoup(entry.description,
                                 "html.parser").find("body")
            if not body:
                self.logger.write_error(
                    "Can not find description for {}".format(entry.link))
                plain = ""
            else:
                for node in body(["script", "style"]):
                    node.decompose()
                # Keep only the non-empty lines of the stripped text.
                lines = body.get_text().strip().splitlines()
                plain = os.linesep.join(filter(None, lines))
            collected.append({
                "title": entry.title,
                "link": entry.link,
                "date": published,
                "text": plain
            })
        return collected
Пример #5
0
class MainParser:
    """Runs every configured source parser and stores the scraped news."""

    sources = []

    def run(self):
        """Create the logger, load the sources and scrape all of them."""
        self.logger = AppLogger("parser_main")
        self.get_sources()
        self.get_news()

    def get_sources(self):
        """Load all ``NewsSource`` rows into ``self.sources``."""
        from models.models import NewsSource
        self.sources = NewsSource.objects.all()

    def get_news(self):
        """Import the parser module of each source and store its news.

        For every source the module ``parsers.<source.parser>`` is imported
        and its ``SourceParser().get_news()`` result is saved as
        ``NewsMessage`` rows keyed by ``link`` and ``source``. Sources whose
        module can not be imported, or whose module is ``parsers.default``,
        are skipped. After a source is processed its ``last_parsed`` field
        is updated.
        """
        from models.models import NewsMessage
        for s in self.sources:
            import_module = "parsers.{}".format(s.parser)
            try:
                parser_mod = importlib.import_module(import_module)
            except ImportError:
                logger_message = "Can not import module {}".format(
                    import_module)
                self.logger.write_error(logger_message)
                continue
            if parser_mod.__name__ == "parsers.default":
                continue
            parser = parser_mod.SourceParser()
            news = parser.get_news()
            if not news:
                logger_message = "Got 0 news for parser {}.".format(s.parser)
                self.logger.write_error(logger_message)
            for n in news:
                try:
                    # ``defaults`` fills the remaining fields in the same
                    # insert, so a new row is never stored with only its
                    # link; existing rows are left untouched.
                    NewsMessage.objects.get_or_create(
                        link=n["link"],
                        source=s,
                        defaults={
                            "title": n["title"],
                            "text": n["text"],
                            "date": n["date"],
                        })
                except MultipleObjectsReturned:
                    # Duplicates already exist: keep one row, drop the rest.
                    objs = NewsMessage.objects.filter(link=n["link"], source=s)
                    for o in objs[1:]:
                        o.delete()
            now = timezone.now()
            s.last_parsed = now
            s.save()
            logger_message = "Finished for {} on {}".format(s.name, now)
            self.logger.write_info(logger_message)
Пример #6
0
 def __init__(self):
     """Attach the ``parser_rbc`` logger to the new instance."""
     logger = AppLogger("parser_rbc")
     self.logger = logger
Пример #7
0
 def run(self):
     """Set up the ``parser_main`` logger, then load sources and news."""
     self.logger = AppLogger("parser_main")
     # The sources have to be loaded before the news can be fetched.
     for step in (self.get_sources, self.get_news):
         step()
Пример #8
0
 def __init__(self):
     """Attach the ``parser_segodnya`` logger to the new instance."""
     logger = AppLogger("parser_segodnya")
     self.logger = logger
Пример #9
0
class SourceParser:
    """Parser for the Segodnya site.

    The list of posts is read from the JSON document in ``src``; the article
    body is scraped from each post's own page.
    """

    src = "https://www.segodnya.ua/data/last_news_uk.json"

    def __init__(self):
        """Create the logger used to report scraping problems."""
        self.logger = AppLogger("parser_segodnya")

    def get_news(self):
        """Collect posts from the JSON listing.

        Returns:
            list[dict]: one dict per post with the keys ``title``, ``link``,
            ``date`` (timezone-aware ``datetime`` in UTC) and ``text``. The
            list is empty when the listing can not be fetched or decoded;
            the failure is written to the parser log.
        """
        json_data = []
        news = []
        try:
            req = ur.Request(self.src,
                             data=None,
                             headers={'User-Agent': settings.USER_AGENT})
            with ur.urlopen(req) as response:
                encoding = response.info().get_content_charset("utf-8")
                data = response.read()
                json_data = json.loads(data.decode(encoding))
        # HTTPError is a subclass of URLError, so it has to be caught first.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        except ValueError as e:
            # Covers a body that is not valid JSON or can not be decoded,
            # e.g. an HTML page served instead of the listing.
            message = "Can not decode JSON from {}: {}".format(self.src, e)
            self.logger.write_error(message)
        tz = pytz.timezone("UTC")
        for n in json_data:
            link = n["path"]
            if "/uk/" in link:
                link = link.replace("/uk/", "/ua/")
            text = self.get_news_text(link)
            date = datetime.fromtimestamp(n["timestamp"], tz=tz)
            news.append({
                "title": n["title"],
                "link": link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Download the article at *link* and return its text.

        Args:
            link: URL of the article page.

        Returns:
            str: stripped text of the article container with the auxiliary
            blocks removed, or an empty string when the page can not be
            fetched, contains the ``Project Shield Logo`` marker, or has no
            known article container. Failures are written to the parser log.
        """
        text = ""
        # CSS classes of ``div`` blocks that are not part of the article.
        class_to_remove = [
            "content-more-links", "content-social", "article__share",
            "article__facebook", "article__footer",
            "article__banner-container", "article__time", "article__header",
            "article__author", "article__adml", "banner-img", "content-tags",
            "content-source", "content-footer-ads", "content-comments",
            "framebox", "content-meta"
        ]
        try:
            req = ur.Request(link,
                             data=None,
                             headers={'User-Agent': settings.USER_AGENT})
            with ur.urlopen(req) as response:
                soup = BeautifulSoup(response.read(), "html.parser")
                if 'Project Shield Logo' in str(soup):
                    return ''
                # Try the known page layouts in order.
                text_div = soup.find("div", class_="article__body")
                if not text_div:
                    text_div = soup.find("div", class_="article-content")
                    if text_div:
                        text_div = text_div.find("div", class_="col-lg-8")
                if not text_div:
                    text_div = soup.find("article", class_="article__content")
                if not text_div:
                    text_div = soup.find("div", class_="article__content")
                if text_div:
                    for cls in class_to_remove:
                        for div in text_div.find_all("div", {"class": cls}):
                            div.decompose()
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text().strip()
                else:
                    message = "Can not find text block for URL {}".format(link)
                    self.logger.write_error(message)
        # HTTPError is a subclass of URLError, so it has to be caught first.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}, URL: {}".format(e.reason, link)
            self.logger.write_error(message)
        return text
Пример #10
0
 def __init__(self):
     """Attach the ``parser_uapravda`` logger to the new instance."""
     logger = AppLogger("parser_uapravda")
     self.logger = logger
Пример #11
0
 def __init__(self):
     """Attach the ``parser_obozrevatel`` logger to the new instance."""
     logger = AppLogger("parser_obozrevatel")
     self.logger = logger
Пример #12
0
class SourceParser:
    """Parser for the Obozrevatel news feed.

    Entries come from the RSS feed in ``src``; the article body is scraped
    from the page each entry links to, using the fixed request headers in
    ``_headers``.
    """

    src = "https://www.obozrevatel.com/ukr/rss.xml"
    # Headers sent with every article request.
    # NOTE(review): the cookie is a captured browser session value hard-coded
    # here; confirm it is still needed and consider moving it to settings.
    _headers = {
        "X-Requested-With":
        "XMLHttpRequest",
        "Content-Type":
        "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie":
        "__cfduid=de9f8c28a02694a251da7cc160168a5941612981500; oboz_GDPRManager_userAnswer=eyJpc0FncmVlQW5hbHl0aWNzIjp0cnVlLCJpc0FncmVlRnVuY3Rpb25hbCI6dHJ1ZSwiaXNBZ3JlZU1hcmtldGluZyI6dHJ1ZSwiZGF0ZSI6IjIwMjEtMDItMTFUMTY6Mjc6NTMuOTA2WiJ9; oboz_GDPRManager_userAnswer_isAgreeMarketing=true; oboz_GDPRManager_userAnswer_isAgreeAnalytics=true; __tbc=%7Bjzx%7DRKlxRc9cNeAf6mVFg4i1HNS8SHf802M-USBlzHooNvCE-ARlcOqUuVhx3HvN1X_L2rwonuDDCSplqR4oimN_qyD5b4WocRDmMsCPdrMHMdE1YcA6INNNDOzoBdXNOAK7HvInHedgafemchY1LVQWow; __pat=7200000; __pvi=%7B%22id%22%3A%22v-2021-02-11-18-27-52-756-RH5WKNU7FJMCvtz0-0d5cccbfff50e93a481214d604ab1fe5%22%2C%22domain%22%3A%22.obozrevatel.com%22%2C%22time%22%3A1613061686993%7D; xbc=%7Bjzx%7DXhnnCWafjZ-AmCq53Rdjld_iM8ZmXTAUHVy6gpLGBeyXmiVLOzn9vnbb0kat0KKoEM43jIhN5of33u4X03YhRRHW3gbslpO2rDxvGL8YwCcVREHQaWm3DoF5ISCL8ehA8ny3OG-0GSsb5a3E6lxqyQ; pnespsdk_visitor=wz2hkfl2zz5sne06; pnespsdk_ssn=%7B%22%24s%22%3A1613060708121%2C%22visitNumber%22%3A3%7D",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
    }

    def __init__(self):
        """Create the logger used to report scraping problems."""
        self.logger = AppLogger("parser_obozrevatel")

    def get_news(self):
        """Collect posts from the feed.

        Returns:
            list[dict]: one dict per entry with the keys ``title``,
            ``link``, ``date`` (timezone-aware ``datetime`` in UTC) and
            ``text``.
        """
        feed = feedparser.parse(self.src)
        tz = pytz.timezone("UTC")
        news = []
        for n in feed.entries:
            text = self.get_news_text(n.link)
            date = datetime(*n.published_parsed[:6], 0, tz)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Download the article at *link* and return its text.

        Args:
            link: URL of the article page.

        Returns:
            str: text of the first matching article container with
            ``footnote`` blocks removed, or an empty string when the page
            can not be fetched or no container is found. Failures are
            written to the parser log.
        """
        text = ""
        class_to_remove = ["footnote"]
        # Known article containers, tried in order until one matches.
        candidates = (
            "newsFull_text",
            "news-video-full__text",
            "news-full__text",
            "newsItem_fullText",
        )
        request = ur.Request(link, None, self._headers,
                             "https://obozrevatel.com")
        try:
            with ur.urlopen(request) as response:
                soup = BeautifulSoup(response.read(),
                                     "html.parser",
                                     from_encoding="cp1251")
                text_div = None
                for css_class in candidates:
                    text_div = soup.find("div", class_=css_class)
                    if text_div:
                        break
                if text_div:
                    for cls in class_to_remove:
                        for div in text_div.find_all("div", {"class": cls}):
                            div.decompose()
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text()
                    if soup.original_encoding == 'cp1251':
                        # The page may really be UTF-8 that was read as
                        # cp1251; try to undo that and keep the text as is
                        # when the round trip is not possible.
                        try:
                            text = text.encode('cp1251').decode('utf-8')
                        except (UnicodeEncodeError, UnicodeDecodeError):
                            pass
                else:
                    message = "Can not find text block for URL {}".format(link)
                    self.logger.write_error(message)
        # HTTPError is a subclass of URLError, so it has to be caught first;
        # otherwise the more specific handler can never run.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        return text