def handle(self, *args, **options):
    print(join(settings.BASE_DIR, options['file']))
    json_file = open(join(settings.BASE_DIR, options['file']))
    dados = json.load(json_file)

    users = dados['users']
    for u in users:
        u['address'].pop('geo')
        address = Address(**u['address'])
        address.save()

        # user = User()
        # user.username = u['username']
        # user.email = u['email']
        password = '******'
        user = User.objects.create_user(u['username'], u['email'], password)
        user.address = address
        user.save()

        profile = Profile()
        profile.user = user
        profile.address = address
        profile.save()

    posts = dados['posts']
    for p in posts:
        post = Post()
        post.profile = Profile.objects.get(user=User.objects.get(id=p['userId']))
        post.body = p.get('body') or 'sem corpo'
        post.title = p.get('title') or 'sem title'
        post.save()

    comments = dados['comments']
    for c in comments:
        com = Comment()
        com.name = c['name']
        com.email = c['email']
        com.body = c['body']
        com.post = Post.objects.get(pk=c['postId'])
        com.save()
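# A minimal sketch of the management command wrapper assumed around handle() above.
# The class name, help text and the positional argument name 'file' are assumptions
# for illustration; only handle() itself comes from the original snippet.
import json
from os.path import join

from django.conf import settings
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Load users, posts and comments from a JSON fixture into the database'

    def add_arguments(self, parser):
        # Path to the JSON file, relative to settings.BASE_DIR
        parser.add_argument('file', type=str)

    # handle() as defined above would be a method of this class.
    # Usage (hypothetical command name):
    #   python manage.py load_fixture db.json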
def parse_feed_json(source_feed, feed_content, output):
    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if len(entries):
            source_feed.last_success = (
                timezone.now()
            )  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(
                f["description"], "utf-8", "text/html")

        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # output.write(entries)

        entries.reverse()  # Entries are typically in reverse chronological order - put them in right order

        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                output.write("EXISTING " + guid + "\n")
            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(title, "utf-8", "text/html")  # TODO: validate charset ??

            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]
            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except:
                                    ee.length = 0
                                try:
                                    file_type = pe["mime_type"]
                                except:
                                    file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe
                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            if pe["url"] not in seen_files:
                                try:
                                    length = int(pe["size_in_bytes"])
                                except:
                                    length = 0
                                try:
                                    filetype = pe["mime_type"]
                                except:
                                    filetype = "audio/mpeg"
                                ee = Enclosure(post=p, href=pe["url"], length=length, type=filetype)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                output.write(f"couldn't add tag {tag} to post {p}")

    return (ok, changed)
def parse_feed_xml(source_feed, feed_content, output):
    ok = True
    changed = False

    # output.write(ret.content)
    try:
        _customize_sanitizer(parser)
        f = parser.parse(feed_content)  # need to start checking feed parser errors here
        entries = f["entries"]
        if len(entries):
            source_feed.last_success = (
                timezone.now()
            )  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        ok = False

    if ok:
        try:
            source_feed.name = update_source_name(source_feed.name, f.feed.title)
        except Exception:
            pass

        try:
            source_feed.site_url = f.feed.link
        except Exception:
            pass

        try:
            source_feed.image_url = f.feed.image.href
        except Exception:
            pass

        # either of these is fine, prefer description over summary
        # also feedparser will give us itunes:summary etc if there
        try:
            source_feed.description = f.feed.summary
        except Exception:
            pass

        try:
            source_feed.description = f.feed.description
        except Exception:
            pass

        # output.write(entries)

        entries.reverse()  # Entries are typically in reverse chronological order - put them in right order

        for e in entries:
            # we are going to take the longest
            body = ""

            if hasattr(e, "content"):
                for c in e.content:
                    if len(c.value) > len(body):
                        body = c.value

            if hasattr(e, "summary"):
                if len(e.summary) > len(body):
                    body = e.summary

            if hasattr(e, "summary_detail"):
                if len(e.summary_detail.value) > len(body):
                    body = e.summary_detail.value

            if hasattr(e, "description"):
                if len(e.description) > len(body):
                    body = e.description

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e.guid
            except Exception as ex:
                try:
                    guid = e.link
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                output.write("EXISTING " + guid + "\n")
            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e.title
            except Exception:
                title = ""

            try:
                p.link = e.link
            except Exception:
                p.link = ""

            p.title = title

            try:
                p.image_url = e.image.href
            except Exception:
                pass

            try:
                # If there is no published_parsed entry, try updated_parsed
                if "published_parsed" in e:
                    time_struct = e.published_parsed
                else:
                    time_struct = e.updated_parsed
                p.created = datetime.datetime.fromtimestamp(
                    time.mktime(time_struct)).replace(tzinfo=timezone.utc)
            except Exception:
                output.write("CREATED ERROR")

            p.guid = guid
            try:
                p.author = e.author
            except Exception as ex:
                p.author = ""

            try:
                p.save()
                # output.write(p.body)
            except Exception as ex:
                # import pdb; pdb.set_trace()
                output.write(str(ex))

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    for pe in e["enclosures"]:
                        if pe["href"] == ee.href and ee.href not in seen_files:
                            found_enclosure = True
                            try:
                                ee.length = int(pe["length"])
                            except Exception:
                                ee.length = 0
                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe
                            ee.type = file_type
                            ee.save()
                            break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                for pe in e["enclosures"]:
                    try:
                        if pe["href"] not in seen_files:
                            try:
                                length = int(pe["length"])
                            except Exception:
                                length = 0
                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"
                            ee = Enclosure(post=p, href=pe["href"], length=length, type=file_type)
                            ee.save()
                    except Exception:
                        pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e.tags:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                output.write(f"couldn't add tag {tag} to post {p}")

    return (ok, changed)
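# A hedged sketch (not from the original source) of how the two parsers above might be
# dispatched: try to decode the payload as a JSON Feed first and fall back to the
# XML/RSS parser otherwise. The wrapper name parse_feed() is an assumption.
import json


def parse_feed(source_feed, feed_content, output):
    try:
        json.loads(feed_content)      # JSON Feed documents parse as plain JSON
        is_json = True
    except (ValueError, TypeError):
        is_json = False               # anything else is treated as XML (RSS/Atom)

    if is_json:
        return parse_feed_json(source_feed, feed_content, output)
    return parse_feed_xml(source_feed, feed_content, output)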
def update_site_feed(feed, site_id):
    '''This function handles the feed update of a site and is kind of recursive,
    since in the end it will call another apply_async onto itself'''
    from feeds.models import Post, Site
    from feeds.utils import get_sanitized_html

    # Avoids running two instances at the same time
    # Update task_id for this site
    site = Site.objects.get(id=site_id)
    site.task_id = update_site_feed.request.id
    site.save()

    # Update this site info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} is with its feed url broken'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']

        # For some reason, some Google Alerts feeds return an invalid FQDN after parsing,
        # so we must check that the link starts with "http"
        if 'link' in info and info['link'].startswith('http'):
            site.url = info['link']

        if site.feed_errors > 0:
            site.feed_errors = 0

        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not return any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without link we can't save this post
            if 'link' not in entry:
                continue

            url = entry['link']
            title = entry.get('title', '')

            # Try to get content
            if isinstance(entry.get('content'), list):
                try:
                    for content in entry['content']:
                        if content:
                            break
                except IndexError:
                    content = u''
            else:
                content = entry.get('content')

            if not content and 'description' in entry:
                content = entry['description']

            if isinstance(content, dict):
                content = content.get('value')

            # Still no content found, let's try using summary
            if not content and entry.get('summary'):
                content = entry['summary']

            # Parse the content to avoid broken HTML and script tags
            content = get_sanitized_html(content)

            author = entry.get('author')

            if 'published_parsed' in entry and entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                        'created_at': created_at
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                logger.warn('Final URL {} is duplicated'.format(url))
                pass
            else:
                if created:
                    new_posts_found += 1

        logger.info(
            'Site {site_id} got {new} new posts from {total} in feed'.format(
                site_id=site.id, new=new_posts_found, total=len(feed['entries'])
            )
        )

    # Update when this site is to run again
    next_update = site.set_next_update(save=False)
    logger.info("Site's {} next update at {}".format(site.id, next_update))

    site.last_update = timezone.now()
    site.save()
def parse_feed_json(source_feed, feed_content, output):
    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = (24 * 3 * 60)
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(
                f["description"], "utf-8", 'text/html')

        _customize_sanitizer(feedparser)
        if not source_feed.name:
            source_feed.name = feedparser._sanitizeHTML(
                source_feed.name, "utf-8", 'text/html')

        if "icon" in f:
            source_feed.image_url = f["icon"]

        entries.reverse()  # Entries are typically in reverse chronological order - put them in right order

        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(title, "utf-8", 'text/html')  # TODO: validate charset ??

            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]
            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee.type = typ
                                ee.save()
                                break
                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    #     ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Since many RSS feeds embed trackers into their URL that constantly change,
                            # yet almost always include only a single enclosure, we'll only create a new
                            # enclosure when we see a new url if there are no enclosure records created yet.
                            # This is a more robust way of preventing logical duplicates due to tracker URL
                            # changes than trying to predict and strip out all known tracker prefixes.
                            if pe["url"] not in seen_files and not p.enclosures.all().exists():
                                length = int(pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p, href=pe["url"], length=length, type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")

            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')

    return (ok, changed)
def update_site_feed(site):
    '''This function handles the feed update of a site and is kind of recursive,
    since in the end it will call another apply_async onto itself'''
    # Avoids running two instances at the same time
    cachekey = SITE_WORKER_CACHE_KEY.format(id=site.id)
    if cache.get(cachekey):
        logger.warn('Worker for site {} still running'.format(site.id))
        return
    cache.add(cachekey, '1', 60)  # Will not run again in 60 seconds

    from feeds.models import Post

    # Update task_id for this site
    site.task_id = update_site_feed.request.id
    site.save()

    feed = site.getfeed()

    # Update this site info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} is with its feed url broken'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']
        if 'link' in info:
            site.url = info['link']
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not return any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without link we can't save this post
            if 'link' not in entry:
                continue

            url = entry['link']
            title = entry.get('title', '')

            # Try to get content
            if isinstance(entry.get('content'), list):
                try:
                    for content in entry['content']:
                        if content:
                            break
                except IndexError:
                    content = u''
            else:
                content = entry.get('content')

            if not content and 'description' in entry:
                content = entry['description']

            if isinstance(content, dict):
                content = content.get('value')

            author = entry.get('author')

            if 'published_parsed' in entry and entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                pass
            else:
                if created:
                    new_posts_found += 1
                post.created_at = created_at
                post.save()

        logger.info('Site {site_id} got {new} new posts from {total} in feed'.format(
            site_id=site.id, new=new_posts_found, total=len(feed['entries'])))
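# Hypothetical sketch of the Post.hashurl() helper used by update_site_feed() above.
# The original implementation and the model fields are not shown in this snippet; this
# only illustrates the idea of keying posts by a fixed-length hash of their URL so that
# url_hash can serve as a unique, indexable lookup column.
import hashlib

from django.db import models


class Post(models.Model):  # assumed fields, for illustration only
    url = models.URLField(max_length=2000)
    url_hash = models.CharField(max_length=32, unique=True, db_index=True)

    @staticmethod
    def hashurl(url):
        # md5 keeps the key short and deterministic; it is not used for security here
        return hashlib.md5(url.encode('utf-8')).hexdigest()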
        address.save()

        user = User()
        user.username = u['username']
        user.email = u['email']
        # NOTE: assigning to user.password stores the raw string; prefer
        # User.objects.create_user() or user.set_password() so it is hashed.
        user.password = '******'
        user.address = address
        user.save()

        profile = Profile()
        profile.user = user
        profile.address = address
        profile.save()

    posts = dados['posts']
    for p in posts:
        post = Post()
        post.profile = Profile.objects.get(user=User.objects.get(id=p['userId']))
        post.body = p.get('body') or 'sem corpo'
        post.title = p.get('title') or 'sem title'
        post.save()

    comments = dados['comments']
    for c in comments:
        com = Comment()
        com.name = c['name']
        com.email = c['email']
        com.body = c['body']
        com.post = Post.objects.get(pk=c['postId'])
        com.save()