def parse_rss():
    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'
    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)
    try:
        packages = {
            x.get('name').strip(): x
            for x in list(Package.objects.all()
                          .values('name', 'description', 'link'))
        }

        _start_week, _end_week = get_start_end_of_week(today)
        _ = Issue.objects.filter(date_from=_start_week, date_to=_end_week)
        assert _.count() <= 1, 'Many ISSUE on week'
        _ = None if _.count() == 0 else _[0]
        news = Item.objects.filter(issue=_, status='active') if _ is not None else []

        section = Section.objects.get(title='Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception as e:
        print(e)
        return

    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')
        ct = len(Item.objects.filter(link=n.link, status='active')[0:1])
        if ct or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue
            if news and check_previous_news_of_package(news, packages.get(package_name)):
                off_other_release_news(news, packages.get(package_name))
            item_data = _generate_release_item(package_version, n.link, resource,
                                               section, packages.get(package_name))
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception as e:
            print(e)
            continue
def import_python_weekly(issue_url, **kwargs):
    resource = Resource.objects.get(title='PythonWeekly')

    page = html.parse(issue_url)
    # a = requests.get(url).content
    blocks = page.getroot().find_class('bodyTable')[0].xpath(
        '//span[@style="font-size:14px"]')

    for x in blocks:
        link = x.cssselect('a')[0]
        url = link.attrib['href']
        title = link.text

        _text = x.getnext()
        if _text is None:
            continue
        text = etree.tostring(_text).decode('utf-8').replace('<br/>', '').strip()

        item_data = {
            'title': title,
            'link': url,
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }
        item_data.update(
            apply_parsing_rules(item_data, **kwargs)
            if kwargs.get('query_rules') else {})
        item_data = apply_video_rules(item_data.copy())
        save_item(item_data)
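# A hedged usage sketch for import_python_weekly: the **kwargs are assumed to be
# the same "query_*" keys that the ImportPython importer below builds for
# apply_parsing_rules; the issue URL here is a made-up placeholder, not a real one.
import_python_weekly(
    'https://www.pythonweekly.com/archive/example-issue.html',  # hypothetical URL
    query_rules=ParsingRules.objects.filter(is_activated=True).all(),
    query_sections=Section.objects.all(),
    query_statuses=[x[0] for x in ITEM_STATUS_CHOICES],
)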
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss', in_edit=False):
        try:
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item,
                       filter(_is_old_rss_news,
                              get_items_from_rss(src.link))))

            # parse weekly digests
            digests_items = list(rss_items)
            list(map(parse_weekly_digest,
                     filter(is_weekly_digest, digests_items)))
            list(map(parse_django_weekly_digest,
                     filter(is_django_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for i, rss_item in enumerate(digests_items):
                rss_item.update({
                    'resource': resource,
                    'language': language,
                })
                rss_item.update(
                    apply_parsing_rules(rss_item, **kwargs)
                    if kwargs.get('query_rules') else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
def main():
    url = 'http://feed.exileed.com/vk/feed/pynsk'

    _section_title = 'Колонка автора'
    _res_title = 'Александр Сапронов (PyNSK)'

    resource = Resource.objects.filter(title=_res_title)
    assert resource.count() == 1, "Not found resource: %s" % _res_title
    resource = resource[0]

    section = Section.objects.filter(title=_section_title)
    assert section.count() == 1, "Not found section: %s" % _section_title
    section = section[0]

    r = re.compile(r"(htt(p|ps)://[^ ]+)")
    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)

    rssnews = feedparser.parse(url)
    for n in reversed(rssnews.entries):
        if len(Item.objects.filter(link=n.link)[0:1]):
            continue
        # print("Parse: %s" % n.link)

        title = None
        content = None

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        text = n.summary
        # `l` is expected to be a module-level list of title prefixes
        # that mark the beginning of a post heading (see the sketch below).
        for x in l:
            if x in text and '<br><br>' in text.split(x)[1]:
                _ = text.split(x)[1].split('<br>')
                title = x + _[0]
                content = ' </br>\n'.join(filter(lambda x: x, _[1:]))
                content = r.sub(r'<a href="\1">\1</a>', content)
                break

        if title is not None and content is not None:
            content_link = "<a href='%s' target='_blank'>[Продолжение]</a>" % n.link
            content = textwrap.shorten(
                content, width=300,
                placeholder="...%s" % content_link).replace('<a...', '...')

            _ = {
                'link': n.link,
                'description': content,
                'title': title,
                'resource': resource,
                'language': 'ru',
                'section': section,
                'status': 'active',
            }
            save_item(_)
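# `l` is not defined in this snippet. Judging by how it is used above (each
# element both matches the post text and becomes the title prefix), it is
# presumably a module-level list of heading markers. The values below are a
# purely hypothetical illustration, not the real list:
l = [
    'Совет:',    # hypothetical prefix values
    'Статья:',
    'Новость:',
]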
def parse_rss():
    # TODO: hardcoded; this is a moderator's personal feed.
    # If possible, replace it with a feed created specifically for pydigest.
    url = 'https://allmychanges.com/rss/05a5ec600331b03741bd08244afa11cb/'
    try:
        packages = {x.get('name'): x
                    for x in list(Package.objects.all()
                                  .values('name', 'description', 'url'))}
        section = Section.objects.get(title=u'Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception:
        return

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)

    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')
        ct = len(Item.objects.filter(link=n.link)[0:1])
        if ct or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue
            item_data = _generate_release_item(
                package_name,
                package_version,
                n.link,
                resource,
                section,
                packages.get(package_name)
            )
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception:
            continue
def parse_rss():
    url = 'https://allmychanges.com/rss/03afbe621916b2f2145f111075db0759/'
    try:
        packages = {x.get('name').strip(): x
                    for x in list(Package.objects.all()
                                  .values('name', 'description', 'url'))}
        section = Section.objects.get(title=u'Релизы')
        resource = Resource.objects.get(link='http://allmychanges.com/')
    except Exception:
        return

    today = datetime.date.today()
    week_before = today - datetime.timedelta(weeks=1)

    saved_packages = []
    for n in feedparser.parse(url).entries:
        package_name, package_version = n.title.split()
        package_name = package_name.replace('python/', '')
        ct = len(Item.objects.filter(link=n.link)[0:1])
        if ct or 'python' not in n.title:
            saved_packages.append(package_name)
            continue

        time_struct = getattr(n, 'published_parsed', None)
        if time_struct:
            _timestamp = mktime(time_struct)
            dt = datetime.datetime.fromtimestamp(_timestamp)
            if dt.date() < week_before:
                continue

        try:
            if package_name not in packages or package_name in saved_packages:
                continue
            item_data = _generate_release_item(
                package_name,
                package_version,
                n.link,
                resource,
                section,
                packages.get(package_name)
            )
            saved_packages.append(package_name)
            save_item(item_data)
        except Exception:
            continue
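# _generate_release_item is called above but not shown in this section. A
# minimal sketch of what it plausibly returns, matching the six-argument call
# in the variant above (the earlier parse_rss variant passes one argument
# fewer) and reusing the item fields seen elsewhere in this module; the title
# format and field values are assumptions:
def _generate_release_item(package_name, package_version, link,
                           resource, section, package):
    # `package` is a dict with 'name', 'description' and 'url' keys,
    # as built by the packages mapping in parse_rss() above.
    return {
        'title': '{} - {}'.format(package_name, package_version),  # assumed format
        'link': link,
        'resource': resource,
        'section': section,
        'status': 'active',
        'language': 'en',
        'description': package.get('description') or '',
    }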
def main(url: str = "", number: int = 0) -> None:
    data = {
        'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
        'query_sections': Section.objects.all(),
        'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
    }
    _apply_rules = _apply_rules_wrap(**data)

    parser = ImportPythonParser()
    if number and not url:
        url = parser.get_issue_url(number)
    if not number and not url:
        url = parser.get_latest_issue_url()

    blocks = parser.get_blocks(url)
    with_rules_applied = map(_apply_rules, blocks)
    for block in with_rules_applied:
        save_item(block)
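# A hedged usage sketch: main() resolves the issue URL itself, so it can be
# called with an issue number, an explicit URL, or nothing at all. The number
# and URL below are made-up examples.
main(number=101)
main(url='https://importpython.com/newsletter/no-101/')  # hypothetical URL
main()  # falls back to parser.get_latest_issue_url()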
def import_tweets(**kwargs):
    for i in get_tweets():
        # this saves us from re-parsing a link that already exists
        if Item.objects.filter(link=i[1]).exists():
            continue
        # title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        title = i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2],
        }
        data = apply_parsing_rules(item_data, **kwargs) \
            if kwargs.get('query_rules') else {}
        item_data.update(data)
        save_item(item_data)
def import_tweets(**kwargs):
    for i in get_tweets():
        try:
            # this saves us from re-parsing a link that already exists
            if Item.objects.filter(link=i[1]).exists():
                continue
            # title = '[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
            title = i[0]
            item_data = {
                "title": title,
                "link": i[1],
                "http_code": i[3],
                "resource": i[2],
            }
            if is_weekly_digest(item_data):
                parse_weekly_digest(item_data)
            else:
                data = apply_parsing_rules(item_data, **kwargs) \
                    if kwargs.get("query_rules") else {}
                item_data.update(data)
                save_item(item_data)
        except (URLError, TooManyRedirects, socket.timeout):
            print(i)
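# get_tweets() is not shown; from the indexing in the import_tweets variants
# above, each element is a tuple laid out as (title, link, resource, http_code).
# A hedged sketch of one element (the values are made up):
example_tweet = (
    'Some tweeted title',            # i[0] -> 'title'
    'https://example.com/article/',  # i[1] -> 'link'
    Resource.objects.first(),        # i[2] -> presumably a Resource instance
    200,                             # i[3] -> 'http_code'
)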
def parse():
    base_url = 'https://twitter.com/NewReleaseNotes/'
    packages = list(Package.objects.all().values('name', 'description', 'url'))
    if packages:
        try:
            section = Section.objects.get(title=u'Релизы')
            resource = Resource.objects.get(link='http://allmychanges.com/')
        except Exception:
            return

        tweets_data = get_tweets_by_url(base_url)
        for text, link, http_code in tweets_data:
            for x in packages:
                if 'python' in text and "python/%s" % x.get('name').lower() in text:
                    name = u"{} - {}".format(x.get('name'), text.split(' of')[0])
                    description = u"Вышла новая версия пакета {0} - {1}." \
                                  u" {2}." \
                                  u" Изменения описаны по ссылке <a href='{3}'>{3}</a>. " \
                                  u"Скачать можно по ссылке: <a href='{4}'>{4}</a>".format(
                                      x.get('name'),
                                      text.split(' of')[0],
                                      x.get('description'),
                                      link,
                                      x.get('url')
                                  )

                    save_item({
                        'title': name,
                        'link': link,
                        'resource': resource,
                        'status': 'active',
                        'section': section,
                        'language': 'en',
                        'description': description,
                    })
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res="rss", in_edit=False):
        try:
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item,
                       filter(_is_old_rss_news,
                              get_items_from_rss(src.link))),
            )

            # parse weekly digests
            digests_items = list(rss_items)
            list(map(parse_weekly_digest,
                     filter(is_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for i, rss_item in enumerate(digests_items):
                rss_item.update({"resource": resource, "language": language})
                rss_item.update(
                    apply_parsing_rules(rss_item, **kwargs)
                    if kwargs.get("query_rules") else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss', in_edit=False):
        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #     n.title) else n.title
            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
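# _get_http_data_of_url is not shown in this section. Judging by how its
# result is unpacked above, it returns (http_code, content, raw_content) for
# a link. A minimal sketch using requests, purely as an assumption about the
# real helper:
import requests

def _get_http_data_of_url(url):
    try:
        response = requests.get(url, timeout=10)
        raw_content = response.text
        # the real helper may extract readable content from the HTML;
        # here "content" is simply the raw body again
        return response.status_code, raw_content, raw_content
    except requests.RequestException:
        return 404, '', ''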
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss', in_edit=False):
        rss_items = map(get_data_for_rss_item,
                        filter(is_not_exists_rss_item,
                               filter(_is_old_rss_news,
                                      get_items_from_rss(src.link))))

        # parse weekly digests
        digests_items = list(rss_items)
        list(map(parse_weekly_digest,
                 filter(is_weekly_digest, digests_items)))

        resource = src.resource
        language = src.language
        for i, rss_item in enumerate(digests_items):
            rss_item.update({
                'resource': resource,
                'language': language,
            })
            rss_item.update(
                apply_parsing_rules(rss_item, **kwargs)
                if kwargs.get('query_rules') else {})
            rss_item.update(apply_video_rules(rss_item.copy()))
            save_item(rss_item)
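# The two filter predicates used above are not shown here. Their bodies can be
# inferred from the older import_rss variant further up, which performed the
# same checks inline; the sketches below are assumptions based on that code
# and assume get_items_from_rss yields feedparser-style entries. Note the
# slightly misleading name _is_old_rss_news: to act as a keep-filter it has to
# return True for entries that are recent enough.
def is_not_exists_rss_item(entry):
    # keep only entries whose link is not already stored
    return not Item.objects.filter(link=entry.link).exists()

def _is_old_rss_news(entry):
    week_before = datetime.date.today() - datetime.timedelta(weeks=1)
    time_struct = getattr(entry, 'published_parsed', None)
    if not time_struct:
        # entries without a publication date were not skipped in the inline version
        return True
    dt = datetime.datetime.fromtimestamp(mktime(time_struct))
    return dt.date() >= week_before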