import datetime
import re
import socket
from time import mktime
from urllib.error import URLError

import feedparser
import requests
from lxml import etree, html
from requests.exceptions import TooManyRedirects

# Models and helpers (AutoImportResource, Resource, Item, save_item,
# apply_parsing_rules, apply_video_rules, get_items_from_rss, the digest
# parsers and predicates, etc.) come from the surrounding project.


def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        try:
            # fetch feed entries, drop known and stale ones, normalize
            rss_items = map(
                get_data_for_rss_item,
                filter(is_not_exists_rss_item,
                       filter(_is_old_rss_news,
                              get_items_from_rss(src.link))))

            # parse weekly digests
            digests_items = list(rss_items)
            list(map(parse_weekly_digest,
                     filter(is_weekly_digest, digests_items)))
            list(map(parse_django_weekly_digest,
                     filter(is_django_weekly_digest, digests_items)))

            resource = src.resource
            language = src.language
            for rss_item in digests_items:
                rss_item.update({
                    'resource': resource,
                    'language': language,
                })
                rss_item.update(
                    apply_parsing_rules(rss_item, **kwargs)
                    if kwargs.get('query_rules') else {})
                rss_item.update(apply_video_rules(rss_item.copy()))
                save_item(rss_item)
        except (URLError, TooManyRedirects, socket.timeout):
            print(src)
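# None of the predicate helpers are shown in this section. As an
# illustration only, a minimal sketch of `_is_old_rss_news` that matches
# the one-week cutoff used by the feedparser-based importer below; the
# `related_to_date` key is an assumption, not something this section
# confirms.
def _is_old_rss_news(rss_item):
    # Returns True for items published within the last week, so filter()
    # keeps fresh entries (the "is_old" name reads inverted here).
    week_before = datetime.date.today() - datetime.timedelta(weeks=1)
    return rss_item.get('related_to_date', week_before) >= week_before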
def import_python_weekly(issue_url, **kwargs):
    resource = Resource.objects.get(title='PythonWeekly')

    page = html.parse(issue_url)
    # a = requests.get(url).content
    blocks = page.getroot().find_class('bodyTable')[0].xpath(
        '//span[@style="font-size:14px"]')

    for x in blocks:
        link = x.cssselect('a')[0]
        url = link.attrib['href']
        title = link.text

        _text = x.getnext()
        if _text is None:
            continue
        text = (etree.tostring(_text).decode('utf-8')
                .replace('<br/>', '').strip())

        item_data = {
            'title': title,
            'link': url,
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }
        item_data.update(
            apply_parsing_rules(item_data, **kwargs)
            if kwargs.get('query_rules') else {})
        item_data = apply_video_rules(item_data.copy())
        save_item(item_data)
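# Usage sketch: the issue URL is a placeholder and `rules_queryset` is a
# hypothetical stand-in for whatever `apply_parsing_rules` expects; with
# `query_rules` set, every parsed block is passed through the rules
# before being saved.
#
#     import_python_weekly(
#         'https://www.pythonweekly.com/archive/example.html',
#         query_rules=rules_queryset)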
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            # skip entries that are already stored
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            # skip entries published more than a week ago
            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #     n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
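# `_get_http_data_of_url` is project code that this section does not
# show. A minimal sketch, assuming it fetches the page with requests and
# returns an (http_code, cleaned_text, raw_html) triple to match the
# unpacking above:
def _get_http_data_of_url(url):
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return 0, '', ''  # sentinel for an unreachable page
    raw_content = response.text
    content = re.sub('<.*?>', '', raw_content)  # crude tag stripping
    return response.status_code, content, raw_content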
def _apply_rules(item: dict) -> dict:
    # Works only as a nested helper: `rules` and `kwargs` are free
    # variables closed over from the enclosing import function.
    item.update(
        apply_parsing_rules(item, **rules)
        if kwargs.get('query_rules') else {})
    item.update(apply_video_rules(item))
    return item
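# A hypothetical wrapper showing the closure shape `_apply_rules`
# expects; the importer name, the `items` argument, and `rules = kwargs`
# are assumptions, not part of this section.
def import_feed_items(items, **kwargs):
    rules = kwargs

    def _apply_rules(item: dict) -> dict:
        item.update(
            apply_parsing_rules(item, **rules)
            if kwargs.get('query_rules') else {})
        item.update(apply_video_rules(item))
        return item

    for item in map(_apply_rules, items):
        save_item(item)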