import tweepy

# AppLogger and settings are project-local modules; their import paths are
# not shown in the source. This method belongs to a class that provides
# img_path and img_caption attributes.


def post_to_twitter(self):
    """Post the stored image and caption to Twitter."""
    tw_logger = AppLogger("twitter_api")
    auth = tweepy.OAuthHandler(settings.TW_CONSUMER_KEY,
                               settings.TW_CONSUMER_SECRET)
    auth.set_access_token(settings.TW_ACCESS_TOKEN,
                          settings.TW_ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    try:
        api.verify_credentials()
    except Exception as e:
        message = "Error creating API: {}".format(e)
        tw_logger.write_error(message)
        return ""
    result = api.update_with_media(filename=self.img_path,
                                   status=self.img_caption)
    return result
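# The modules in this project all depend on a project-local AppLogger that
# exposes write_error() and write_info(). Below is a minimal sketch of that
# interface built on the stdlib logging module; only the class name and the
# two method names are taken from the calling code, the rest is an assumption.
import logging


class AppLogger:
    """Hypothetical stand-in for the project's AppLogger."""

    def __init__(self, name):
        self.logger = logging.getLogger(name)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                "%(asctime)s %(name)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def write_error(self, message):
        self.logger.error(message)

    def write_info(self, message):
        self.logger.info(message)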
import os
from datetime import datetime

import feedparser
import pytz
from bs4 import BeautifulSoup


class SourceParser:
    """Main class for Znaj site parser."""

    src = "https://znaj.ua/feed/rss2.xml"

    def __init__(self):
        """Init main class."""
        self.logger = AppLogger("parser_znaj")

    def get_news(self):
        """Get new posts from current source."""
        d = feedparser.parse(self.src)
        news = []
        tz = pytz.timezone("UTC")
        for n in d.entries:
            date = datetime(*n.published_parsed[:6], tzinfo=tz)
            text = ""
            # Skip entries without usable content instead of aborting the
            # whole feed (the original "and"/"break" dropped later entries
            # and crashed on an empty content list).
            if not hasattr(n, "content") or not n.content:
                logger_message = "Can not find content for URL {}".format(
                    n.link)
                self.logger.write_error(logger_message)
                continue
            text_div = BeautifulSoup(n.content[0].value, "html.parser")
            if text_div:
                for script in text_div(["script", "style"]):
                    script.decompose()
                text = text_div.get_text().strip()
                text = os.linesep.join([s for s in text.splitlines() if s])
            else:
                logger_message = "Can not find text block for URL {}".format(
                    n.link)
                self.logger.write_error(logger_message)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news
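# A hedged usage sketch: each SourceParser in this project exposes the same
# get_news() interface, so any parser module can be exercised standalone like
# this. Assumes feedparser, pytz and beautifulsoup4 are installed.
if __name__ == "__main__":
    parser = SourceParser()
    for item in parser.get_news():
        print(item["date"], item["title"], item["link"])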
import urllib.request as ur
from datetime import datetime
from urllib.error import HTTPError, URLError

import feedparser
import pytz
from bs4 import BeautifulSoup


class SourceParser:
    """Main class for UA Pravda site parser."""

    src = "https://www.pravda.com.ua/rss/view_news/"

    def __init__(self):
        """Init main class."""
        self.logger = AppLogger("parser_uapravda")

    def get_news(self):
        """Get new posts from current source."""
        d = feedparser.parse(self.src)
        news = []
        tz = pytz.timezone("UTC")
        for n in d.entries:
            if not hasattr(n, "link"):
                continue
            text = self.get_news_text(n.link)
            date = datetime(*n.published_parsed[:6], tzinfo=tz)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Get news text by provided link."""
        text = ""
        try:
            with ur.urlopen(link) as response:
                soup = BeautifulSoup(response.read(), "html.parser",
                                     from_encoding="cp1251")
                text_div = soup.find("div", class_="post_news__text")
                if not text_div:
                    text_div = soup.find("div", class_="post__text")
                if not text_div:
                    text_div = soup.find("div", class_="post_text")
                if not text_div:
                    text_div = soup.find("article", class_="article")
                if text_div:
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text()
                else:
                    message = "Can not find text block for URL {}".format(
                        link)
                    self.logger.write_error(message)
        # HTTPError subclasses URLError, so it must be caught first; the
        # original order made this branch unreachable.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        return text
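# The chain of soup.find() fallbacks above repeats across the parsers in this
# project. A sketch of an equivalent loop-based helper is below; the name
# find_first is a hypothetical refactoring, not part of the original code.
def find_first(soup, candidates):
    """Return the first element matching any (tag, css_class) candidate."""
    for tag, css_class in candidates:
        element = soup.find(tag, class_=css_class)
        if element:
            return element
    return None


# Usage with the UA Pravda selectors from get_news_text():
# text_div = find_first(soup, [("div", "post_news__text"),
#                              ("div", "post__text"),
#                              ("div", "post_text"),
#                              ("article", "article")])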
import os
from datetime import datetime

import feedparser
import pytz
from bs4 import BeautifulSoup


class SourceParser:
    """Main class for 24 Channel site parser."""

    src = "https://24tv.ua/rss/all.xml"

    def __init__(self):
        """Init main class."""
        self.logger = AppLogger("parser_ua24")

    def get_news(self):
        """Get new posts from current source."""
        d = feedparser.parse(self.src)
        news = []
        tz = pytz.timezone("UTC")
        for n in d.entries:
            date = datetime(*n.published_parsed[:6], tzinfo=tz)
            text = ""
            if hasattr(n, "description") and n.description:
                # html.parser does not wrap fragments in <body>, so the
                # original soup.find("body") lookup always returned None;
                # work on the parsed fragment directly instead.
                text_div = BeautifulSoup(n.description, "html.parser")
                for script in text_div(["script", "style"]):
                    script.decompose()
                text = text_div.get_text().strip()
                text = os.linesep.join([s for s in text.splitlines() if s])
            else:
                logger_message = "Can not find description for {}".format(
                    n.link)
                self.logger.write_error(logger_message)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news
import importlib

from django.core.exceptions import MultipleObjectsReturned
from django.utils import timezone


class MainParser:
    """Main class for all parsers."""

    sources = []

    def run(self):
        """Run the full scraping pipeline."""
        self.logger = AppLogger("parser_main")
        self.get_sources()
        self.get_news()

    def get_sources(self):
        """Get sources to parse."""
        from models.models import NewsSource
        self.sources = NewsSource.objects.all()

    def get_news(self):
        """Load module for parsing and scrape news."""
        from models.models import NewsMessage
        for s in self.sources:
            import_module = "parsers.{}".format(s.parser)
            try:
                parser_mod = importlib.import_module(import_module)
            except ImportError:
                logger_message = "Can not import module {}".format(
                    import_module)
                self.logger.write_error(logger_message)
                parser_mod = None
            if not parser_mod or parser_mod.__name__ == "parsers.default":
                continue
            parser = parser_mod.SourceParser()
            news = parser.get_news()
            if not news:
                logger_message = "Got 0 news for parser {}.".format(s.parser)
                self.logger.write_error(logger_message)
            for n in news:
                try:
                    obj, created = NewsMessage.objects.get_or_create(
                        link=n["link"], source=s)
                    if created:
                        obj.title = n["title"]
                        obj.text = n["text"]
                        obj.date = n["date"]
                        obj.save()
                except MultipleObjectsReturned:
                    # Keep the first duplicate and delete the rest.
                    objs = NewsMessage.objects.filter(link=n["link"],
                                                      source=s)
                    for o in objs[1:]:
                        o.delete()
            now = timezone.now()
            s.last_parsed = now
            s.save()
            logger_message = "Finished for {} on {}".format(s.name, now)
            self.logger.write_info(logger_message)
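# A sketch of how MainParser might be wired into a Django management command
# so the whole pipeline can run from cron. The command layout here is an
# assumption; only MainParser().run() comes from the code above.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Scrape all configured news sources."

    def handle(self, *args, **options):
        MainParser().run()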
# Fragment of the RBC parser's SourceParser; the rest of the class is not
# shown in the source.
def __init__(self):
    """Init main class."""
    self.logger = AppLogger("parser_rbc")
import json
import urllib.request as ur
from datetime import datetime
from urllib.error import HTTPError, URLError

import pytz
from bs4 import BeautifulSoup


class SourceParser:
    """Main class for Segodnya site parser."""

    src = "https://www.segodnya.ua/data/last_news_uk.json"

    def __init__(self):
        """Init main class."""
        self.logger = AppLogger("parser_segodnya")

    def get_news(self):
        """Get new posts from current source."""
        json_data = []
        news = []
        try:
            req = ur.Request(self.src, data=None,
                             headers={"User-Agent": settings.USER_AGENT})
            with ur.urlopen(req) as response:
                encoding = response.info().get_content_charset("utf-8")
                data = response.read()
                json_data = json.loads(data.decode(encoding))
        # HTTPError subclasses URLError, so it must be caught first; the
        # original order made the HTTPError branch unreachable.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        tz = pytz.timezone("UTC")
        for n in json_data:
            link = n["path"]
            if "/uk/" in link:
                link = link.replace("/uk/", "/ua/")
            text = self.get_news_text(link)
            date = datetime.fromtimestamp(n["timestamp"], tz=tz)
            news.append({
                "title": n["title"],
                "link": link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Get news text by provided link."""
        text = ""
        class_to_remove = [
            "content-more-links", "content-social", "article__share",
            "article__facebook", "article__footer",
            "article__banner-container", "article__time", "article__header",
            "article__author", "article__adml", "banner-img", "content-tags",
            "content-source", "content-footer-ads", "content-comments",
            "framebox", "content-meta"
        ]
        try:
            req = ur.Request(link, data=None,
                             headers={"User-Agent": settings.USER_AGENT})
            with ur.urlopen(req) as response:
                soup = BeautifulSoup(response.read(), "html.parser")
                # Skip pages that only contain the Project Shield
                # placeholder instead of the article.
                if "Project Shield Logo" in str(soup):
                    return ""
                text_div = soup.find("div", class_="article__body")
                if not text_div:
                    text_div = soup.find("div", class_="article-content")
                    if text_div:
                        text_div = text_div.find("div", class_="col-lg-8")
                if not text_div:
                    text_div = soup.find("article",
                                         class_="article__content")
                if not text_div:
                    text_div = soup.find("div", class_="article__content")
                if text_div:
                    for cls in class_to_remove:
                        for div in text_div.find_all("div", {"class": cls}):
                            div.decompose()
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text().strip()
                else:
                    message = "Can not find text block for URL {}".format(
                        link)
                    self.logger.write_error(message)
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}, URL: {}".format(e.reason,
                                                                link)
            self.logger.write_error(message)
        return text
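# Both Segodnya methods repeat the same urlopen boilerplate (a User-Agent
# header plus HTTP error handling). A sketch of a shared helper both could
# call is below; fetch_bytes is a hypothetical name, not part of the project.
import urllib.request as ur
from urllib.error import HTTPError, URLError


def fetch_bytes(url, user_agent, logger):
    """Return the response body for url, or None on a logged error."""
    req = ur.Request(url, data=None, headers={"User-Agent": user_agent})
    try:
        with ur.urlopen(req) as response:
            return response.read()
    except HTTPError as e:
        logger.write_error(
            "HTTPError with code {} and reason {}, URL: {}".format(
                e.code, e.reason, url))
    except URLError as e:
        logger.write_error(
            "URLError with reason {}, URL: {}".format(e.reason, url))
    return None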
import urllib.request as ur
from datetime import datetime
from urllib.error import HTTPError, URLError

import feedparser
import pytz
from bs4 import BeautifulSoup


class SourceParser:
    """Main class for Obozrevatel site parser."""

    src = "https://www.obozrevatel.com/ukr/rss.xml"
    _headers = {
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "__cfduid=de9f8c28a02694a251da7cc160168a5941612981500; oboz_GDPRManager_userAnswer=eyJpc0FncmVlQW5hbHl0aWNzIjp0cnVlLCJpc0FncmVlRnVuY3Rpb25hbCI6dHJ1ZSwiaXNBZ3JlZU1hcmtldGluZyI6dHJ1ZSwiZGF0ZSI6IjIwMjEtMDItMTFUMTY6Mjc6NTMuOTA2WiJ9; oboz_GDPRManager_userAnswer_isAgreeMarketing=true; oboz_GDPRManager_userAnswer_isAgreeAnalytics=true; __tbc=%7Bjzx%7DRKlxRc9cNeAf6mVFg4i1HNS8SHf802M-USBlzHooNvCE-ARlcOqUuVhx3HvN1X_L2rwonuDDCSplqR4oimN_qyD5b4WocRDmMsCPdrMHMdE1YcA6INNNDOzoBdXNOAK7HvInHedgafemchY1LVQWow; __pat=7200000; __pvi=%7B%22id%22%3A%22v-2021-02-11-18-27-52-756-RH5WKNU7FJMCvtz0-0d5cccbfff50e93a481214d604ab1fe5%22%2C%22domain%22%3A%22.obozrevatel.com%22%2C%22time%22%3A1613061686993%7D; xbc=%7Bjzx%7DXhnnCWafjZ-AmCq53Rdjld_iM8ZmXTAUHVy6gpLGBeyXmiVLOzn9vnbb0kat0KKoEM43jIhN5of33u4X03YhRRHW3gbslpO2rDxvGL8YwCcVREHQaWm3DoF5ISCL8ehA8ny3OG-0GSsb5a3E6lxqyQ; pnespsdk_visitor=wz2hkfl2zz5sne06; pnespsdk_ssn=%7B%22%24s%22%3A1613060708121%2C%22visitNumber%22%3A3%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
    }

    def __init__(self):
        """Init main class."""
        self.logger = AppLogger("parser_obozrevatel")

    def get_news(self):
        """Get new posts from current source."""
        d = feedparser.parse(self.src)
        news = []
        tz = pytz.timezone("UTC")
        for n in d.entries:
            text = self.get_news_text(n.link)
            date = datetime(*n.published_parsed[:6], tzinfo=tz)
            news.append({
                "title": n.title,
                "link": n.link,
                "date": date,
                "text": text
            })
        return news

    def get_news_text(self, link):
        """Get news text by provided link."""
        text = ""
        class_to_remove = ["footnote"]
        request = ur.Request(link, None, self._headers,
                             origin_req_host="https://obozrevatel.com")
        try:
            with ur.urlopen(request) as response:
                soup = BeautifulSoup(response.read(), "html.parser",
                                     from_encoding="cp1251")
                text_div = soup.find("div", class_="newsFull_text")
                if not text_div:
                    text_div = soup.find("div",
                                         class_="news-video-full__text")
                if not text_div:
                    text_div = soup.find("div", class_="news-full__text")
                if not text_div:
                    text_div = soup.find("div", class_="newsItem_fullText")
                if text_div:
                    for cls in class_to_remove:
                        for div in text_div.find_all("div", {"class": cls}):
                            div.decompose()
                    for script in text_div(["script", "style"]):
                        script.decompose()
                    text = text_div.get_text()
                    if soup.original_encoding == "cp1251":
                        # Pages that are really UTF-8 decode as mojibake;
                        # round-trip through cp1251 to repair them.
                        try:
                            text = text.encode("cp1251").decode("utf-8")
                        except (UnicodeEncodeError, UnicodeDecodeError):
                            pass
                else:
                    message = "Can not find text block for URL {}".format(
                        link)
                    self.logger.write_error(message)
        # HTTPError subclasses URLError, so it must be caught first; the
        # original order made this branch unreachable.
        except HTTPError as e:
            message = "HTTPError with code {} and reason {}".format(
                e.code, e.reason)
            self.logger.write_error(message)
        except URLError as e:
            message = "URLError with reason {}".format(e.reason)
            self.logger.write_error(message)
        return text
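# Why the cp1251 round-trip in get_news_text() works, as a standalone sketch:
# when a page is really UTF-8 but was decoded as cp1251, each UTF-8 byte maps
# to one cp1251 character, so encoding back to cp1251 recovers the original
# bytes, which then decode cleanly as UTF-8. The helper name is hypothetical.
def fix_cp1251_mojibake(text):
    """Undo a UTF-8 page mistakenly decoded as cp1251."""
    try:
        return text.encode("cp1251").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Text was genuine cp1251 (or contains characters cp1251 cannot
        # represent); leave it unchanged.
        return text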