def createPost(entry, planet_feed):
    title = entry.get("title", "")
    url = entry.get("link")
    guid = entry.get("link")
    content = entry.get("description") or \
        entry.get("content", [{"value": ""}])[0]["value"]
    comments_url = entry.get("comments")
    date_modified = entry.get("updated_parsed") or \
        entry.get("published_parsed")
    try:
        date_modified = datetime.fromtimestamp(time.mktime(date_modified))
    except Exception:
        date_modified = None

    post = Post(title=title, url=url, guid=guid, content=content,
                comments_url=comments_url, date_modified=date_modified,
                feed=planet_feed)
    # keep the feed entry around so it is available in the pre_save signal
    post.entry = entry
    post.save()
    return post
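
# A minimal usage sketch for createPost(). The helper below is hypothetical
# (not part of this module): it assumes `feedparser` is imported and that
# `planet_feed` is an existing Feed instance supplied by the caller.
def store_all_entries(feed_url, planet_feed):
    document = feedparser.parse(feed_url)
    return [createPost(entry, planet_feed) for entry in document.entries]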
def process_feed(feed_url, create=False):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        Converts things like "-noise-" to "noise" and "- noise -" to "noise".
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        tag = tag.strip()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        raise ValidationError(
            "Please set the variable USER_AGENT = <string> in your settings.py")

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        raise ValidationError("This feed already exists!")
    if not create and not planet_feed:
        # can't update it because it does not exist
        raise ValidationError("This feed does not exist!")

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", "")
        last_modified = document.get("updated_parsed", datetime.now())
        feed_links = document.feed.get("links", [])

        if not blog_url:
            link = filter(lambda item: item["rel"] == "alternate", feed_links)
            if link:
                blog_url = link[0]["href"]

        try:
            blog, created = Blog.objects.get_or_create(
                url=blog_url, defaults={"title": title})
        except:
            raise ValidationError(
                "Sorry, it doesn't look like this feed is formatted properly. "
                "Are you sure it's a valid RSS feed?")

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type"),
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(document.feed.get("opensearch_totalresults",
                                          len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (
            document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("link")
                content = entry.get("description") or \
                    entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if len(Post.objects.filter(url=url, guid=guid)):
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                date_modified=date_modified, feed=planet_feed)
                    # keep the feed entry around for the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because it already exists" \
                        % (guid, url)
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    new_posts_count += 1

                    # create post tags...
                    tag_list = ""
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = tag_name[:255]
                        tag_name = normalize_tag(tag_name)
                        try:
                            if "/" in tag_name:
                                # for path-based categories
                                for subtag in tag_name.split("/"):
                                    # subtag is empty if the name starts or
                                    # ends with a slash
                                    if subtag:
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                                        tag_list = "%s %s" % (tag_list, subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                                tag_list = "%s %s" % (tag_list, tag_name)
                        except AttributeError, e:
                            print "Ignoring tag error: %s" % e
                    post.tags = tag_list
                    post.save()

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    # media:thumbnail has attributes: url, height, width, time.
                    # See: http://www.rssboard.org/media-rss#media-thumbnails
                    # Normalize to a list, store each thumbnail as an
                    # enclosure, and keep height/width/time as extra info.
                    media_thumbnails = entry.get("media_thumbnail", False)
                    if media_thumbnails:
                        if not isinstance(media_thumbnails, list):
                            media_thumbnails = [media_thumbnails]
                        for media_thumbnail in media_thumbnails:
                            mime_type, enc = mimetypes.guess_type(
                                urlparse(media_thumbnail.get("url")).path)
                            extra_info = {
                                "width": media_thumbnail.get("width", None),
                                "height": media_thumbnail.get("height", None),
                                "time": media_thumbnail.get("time", None),
                            }
                            post_enclosure, created = \
                                Enclosure.objects.get_or_create(
                                    post=post,
                                    length=0,
                                    mime_type=mime_type,
                                    link=media_thumbnail.get("url"),
                                    extra_info=extra_info)

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                                 is_contributor=True)
                            pad.save()

                    # send the post_created signal
                    print "post_created.send(sender=post)", post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" % \
                    (feed_url, len(entries) + 1, items_per_page)
                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
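
# The update branch above relies on feedparser's conditional GET support:
# passing the stored `etag`/`modified` values back to the server makes it
# answer 304 Not Modified when nothing changed. A standalone sketch (the
# function and variable names here are illustrative, not from this module):
import feedparser

def fetch_if_changed(url, etag=None, modified=None, agent="planet/1.0"):
    document = feedparser.parse(url, agent=agent, etag=etag, modified=modified)
    if getattr(document, "status", None) == 304:
        return None  # unchanged since the last fetch; nothing to process
    # persist these for the next conditional request
    new_etag = document.get("etag")
    new_modified = document.get("modified")
    return document, new_etag, new_modified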
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        Converts things like "-noise-" to "noise" and "- noise -" to "noise".
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        # fix for HTML entities
        tag = unicode(BeautifulStoneSoup(
            tag, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print 'Please set PLANET = {"USER_AGENT": <string>} in your settings.py'
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(0)
    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(0)

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = unicode(md5(document.feed.get("link")).hexdigest())
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()
        feed_links = document.feed.get("links", [])

        if not blog_url:
            link = filter(lambda item: item["rel"] == "alternate", feed_links)
            if link:
                blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            # TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site, category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(document.feed.get("opensearch_totalresults",
                                          len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (
            document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = unicode(md5(entry.get("link")).hexdigest())
                content = entry.get("description") or \
                    entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if len(Post.objects.filter(url=url, guid=guid)):
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                date_modified=date_modified, feed=planet_feed)
                    # keep the feed entry around for the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because it already exists" \
                        % (guid, url)
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    new_posts_count += 1

                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = normalize_tag(tag_name)
                        if len(tag_name) > 50:
                            continue
                        try:
                            if "/" in tag_name:
                                # for path-based categories
                                for subtag in tag_name.split("/"):
                                    # subtag is empty if the name starts or
                                    # ends with a slash
                                    if subtag:
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError, e:
                            print "Ignoring tag error: %s" % e

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", "text/html"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    if entry.get('media_thumbnail', False):
                        try:
                            media_url = entry.get('media_thumbnail').href
                            media_list = [{"url": media_url}]
                        except AttributeError:
                            media_list = entry.get('media_thumbnail',
                                                   [{"url": None}])
                        for media in media_list:
                            media_url = media["url"]
                            mime_type, enc = mimetypes.guess_type(
                                urlparse(media_url).path)
                            post_enclosure, created = \
                                Enclosure.objects.get_or_create(
                                    post=post,
                                    length=0,
                                    mime_type=mime_type,
                                    link=media_url)

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                                 is_contributor=True)
                            pad.save()

                    # send the post_created signal
                    print 'post_created.send(sender=post)', post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" % \
                    (feed_url, len(entries) + 1, items_per_page)
                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
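
# normalize_tag() above leans on BeautifulSoup 3's BeautifulStoneSoup to
# decode HTML entities. On Python 3 the standard library covers the same
# need; a rough equivalent (an assumption/alternative, not code from this
# project -- note strip("-") trims all leading/trailing dashes, not just one):
from html import unescape

def normalize_tag_py3(tag):
    tag = tag.strip("-")  # "-noise-" -> "noise"
    tag = unescape(tag)   # "&amp;" -> "&"
    return tag.strip().lower()

# normalize_tag_py3("- Noise &amp; Music -") == "noise & music"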
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        Converts things like "-noise-" to "noise" and "- noise -" to "noise".
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        # fix for HTML entities
        tag = BeautifulSoup(tag).prettify(formatter="html")
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print('Please set PLANET = {"USER_AGENT": <string>} in your settings.py')
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print("*" * 20)
    print("Feed: {}".format(feed_url))

    if create and planet_feed:
        # can't create it because it already exists
        print("This feed already exists!")
        exit(0)
    if not create and not planet_feed:
        # can't update it because it does not exist
        print("This feed does not exist!")
        exit(0)

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    # resolve the category up front so it is also available in update mode
    # (the Post created below takes a category)
    if category_title:
        # TODO: site_objects!
        category = Category.objects.get(title=category_title)
    else:
        category = None

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        try:
            # Python 2: hash the bytestring link directly
            guid = unicode(md5(document.feed.get("link")).hexdigest())
        except NameError:
            # Python 3: md5 needs bytes
            guid = md5(document.feed.get("link").encode('utf-8')).hexdigest()
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()
        feed_links = document.feed.get("links", [])

        if not blog_url:
            link = [item for item in feed_links if item["rel"] == "alternate"]
            if link:
                blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site, category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print("No entries to store. status: {} {}".format(
            document.get("status"), document.get("debug_message")))
    else:
        print("Entries total count: {}".format(total_results))
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print("Processing {} entries".format(len(document.entries)))

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                try:
                    guid = unicode(md5(entry.get("link")).hexdigest())
                except NameError:
                    guid = md5(entry.get("link").encode('utf-8')).hexdigest()
                content = entry.get("description") or \
                    entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                try:
                    image_url = entry["media_thumbnail"][0]["url"]
                except Exception:
                    image_url = None
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if len(Post.objects.filter(url=url, guid=guid)):
                        raise PostAlreadyExists

                    # build the post's term-frequency vector from its
                    # description and title text
                    WORD = re.compile(r'\w+')
                    soup = BeautifulSoup(content)
                    combo = []
                    words = nltk.wordpunct_tokenize(soup.get_text())
                    titlez = nltk.wordpunct_tokenize(title)
                    words.extend(titlez)
                    # drop English stopwords (filter into a new list instead
                    # of removing from the list being iterated)
                    words = [word for word in words
                             if word not in stopwords.words('english')]
                    for word in words:
                        combo.append(stemmer.stem(word))
                    lowerwords = [x.lower() for x in combo if len(x) > 1]

                    def text_to_vector(text):
                        words = WORD.findall(text.lower())
                        return Counter(words)

                    # make the vector; drop the 'u' tokens left over from
                    # unicode string reprs
                    vector = text_to_vector(str(lowerwords))
                    del vector['u']
                    vec33 = str(vector)
                    vec = vec33.replace("Counter", "")
                    print(vec)

                    # clustering: bump the rank of every existing post from
                    # another feed whose vector is close enough to this one
                    c = Cluster.objects.get(id=1)
                    clus_id = c.cluster_id
                    i = 0
                    rank = 1
                    for posts in Post.objects.all():
                        threshold = 0
                        if posts.feed != planet_feed:
                            vec1 = ast.literal_eval(posts.myvector)
                            vec2 = ast.literal_eval(vec)
                            threshold = get_cosine(vec1, vec2)
                        if threshold > 0.28:
                            posts.rank += 1
                            posts.save()
                            i += 1
                            clus_id = posts.cluster_id
                    if i == 0:
                        # no match: start a new cluster
                        c.cluster_id += 1
                        c.save()
                        clus_id = c.cluster_id
                        rank = 1
                    elif i > 0:
                        rank = i

                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                image_url=image_url,
                                date_modified=date_modified, feed=planet_feed,
                                cluster_id=clus_id, rank=rank, myvector=vec,
                                category=category)
                    # keep the feed entry around for the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print("Skipping post {} ({}) because it already exists"
                          .format(guid, url))
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    new_posts_count += 1

                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = normalize_tag(tag_name)
                        if len(tag_name) > 50:
                            continue
                        try:
                            if "/" in tag_name:
                                # for path-based categories
                                for subtag in tag_name.split("/"):
                                    # subtag is empty if the name starts or
                                    # ends with a slash
                                    if subtag:
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError as e:
                            print("Ignoring tag error: {}".format(e))

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", "text/html"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    if entry.get('media_thumbnail', False):
                        try:
                            media_url = entry.get('media_thumbnail').href
                            media_list = [{"url": media_url}]
                        except AttributeError:
                            media_list = entry.get('media_thumbnail',
                                                   [{"url": None}])
                        for media in media_list:
                            media_url = media["url"]
                            mime_type, enc = mimetypes.guess_type(
                                urlparse(media_url).path)
                            post_enclosure, created = \
                                Enclosure.objects.get_or_create(
                                    post=post,
                                    length=0,
                                    mime_type=mime_type,
                                    link=media_url)

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                                 is_contributor=True)
                            pad.save()

                    # send the post_created signal
                    print('post_created.send(sender=post)', post)
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "{}?start-index={}&max-results={}".format(
                    feed_url, len(entries) + 1, items_per_page)
                print("retrieving {}...".format(opensearch_url))
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print("{} posts were created. Done.".format(new_posts_count))

    print()
    return new_posts_count
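
# get_cosine() is called above but never defined in this file. A typical
# cosine similarity over the sparse term-frequency dicts used here would
# look like the following sketch (an assumption, not necessarily the
# project's own implementation):
import math

def get_cosine(vec1, vec2):
    # dot product over the terms the two vectors share
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[term] * vec2[term] for term in common)
    # product of the two vector norms
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * \
        math.sqrt(sum(v * v for v in vec2.values()))
    if not denominator:
        return 0.0
    return numerator / denominator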
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """
    # Excerpt: only the clustering step of the entry loop is shown in this
    # revision. It assumes `title`, `content`, `entry`, `guid`, `url`,
    # `comments_url`, `image_url`, `date_modified`, `planet_feed` and
    # `category` are already bound, as in the full version above.

    # build the post's term-frequency vector from its description and title
    WORD = re.compile(r'\w+')
    soup = BeautifulSoup(content)
    combo = []
    words = nltk.wordpunct_tokenize(soup.get_text())
    titlez = nltk.wordpunct_tokenize(title)
    words.extend(titlez)
    # drop English stopwords (filter into a new list instead of removing
    # from the list being iterated)
    words = [word for word in words if word not in stopwords.words('english')]
    for word in words:
        combo.append(stemmer.stem(word))
    lowerwords = [x.lower() for x in combo if len(x) > 1]

    def text_to_vector(text):
        words = WORD.findall(text.lower())
        return Counter(words)

    # make the vector; drop the 'u' tokens left over from unicode reprs
    vector = text_to_vector(str(lowerwords))
    del vector['u']
    vec33 = str(vector)
    vec = vec33.replace("Counter", "")
    print(vec)

    # clustering: bump the rank of every existing post from another feed
    # whose vector is close enough to this one
    c = Cluster.objects.get(id=1)
    clus_id = c.cluster_id
    i = 0
    rank = 1
    for posts in Post.objects.all():
        threshold = 0
        if posts.feed != planet_feed:
            vec1 = ast.literal_eval(posts.myvector)
            vec2 = ast.literal_eval(vec)
            threshold = get_cosine(vec1, vec2)
        if threshold > 0.28:
            posts.rank += 1
            posts.save()
            i += 1
            clus_id = posts.cluster_id
    if i == 0:
        # no match: start a new cluster
        c.cluster_id += 1
        c.save()
        clus_id = c.cluster_id
        rank = 1
    elif i > 0:
        rank = i

    post = Post(title=title, url=url, guid=guid, content=content,
                comments_url=comments_url, image_url=image_url,
                date_modified=date_modified, feed=planet_feed,
                cluster_id=clus_id, rank=rank, myvector=vec,
                category=category)
    # keep the feed entry around for the pre_save signal
    post.entry = entry
    post.save()
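
# `stemmer` is never constructed in these excerpts. NLTK's PorterStemmer is
# the usual choice for an English term-vector pipeline like this one; a
# minimal setup sketch under that assumption (the sample sentence and
# variable names are illustrative):
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter

nltk.download("stopwords")  # wordpunct_tokenize itself needs no model

stemmer = PorterStemmer()
stops = set(stopwords.words("english"))
tokens = [stemmer.stem(w).lower()
          for w in nltk.wordpunct_tokenize("Planet feeds aggregate blog posts")
          if w.lower() not in stops]
vector = Counter(tokens)  # e.g. Counter({'planet': 1, 'feed': 1, ...})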
def process_feed(feed_url, owner_id=None, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """
    print("[process_feed] URL={}".format(feed_url))

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print('Please set PLANET = {"USER_AGENT": <string>} in your settings.py')
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print("*" * 20)
    print("Feed: {}".format(feed_url))

    if create and planet_feed:
        # can't create it because it already exists
        print("This feed already exists!")
        exit(0)
    if not create and not planet_feed:
        # can't update it because it does not exist
        print("This feed does not exist!")
        exit(0)

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        try:
            guid = str(md5(document.feed.get("link")).hexdigest())
        except TypeError:
            # md5 needs bytes on Python 3
            guid = md5(document.feed.get("link").encode('utf-8')).hexdigest()
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()
        feed_links = document.feed.get("links", [])

        if not blog_url:
            link = [item for item in feed_links if item["rel"] == "alternate"]
            if link:
                blog_url = link[0]["href"]

        User = get_user_model()
        try:
            owner = User.objects.get(pk=owner_id)
        except User.DoesNotExist:
            owner = None

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title}, owner=owner,
            short_name=urlparse(blog_url).netloc)

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            # TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site, category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print("No entries to store. status: {} {}".format(
            document.get("status"), document.get("debug_message")))
    else:
        print("Entries total count: {}".format(total_results))
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print("Processing {} entries".format(len(document.entries)))

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                try:
                    guid = unicode(md5(entry.get("link")).hexdigest())
                except NameError:
                    guid = md5(entry.get("link").encode('utf-8')).hexdigest()
                content = entry.get("description") or \
                    entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = planet_feed.last_modified or datetime.now()

                try:
                    if len(Post.objects.filter(url=url, guid=guid)):
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                date_modified=date_modified, feed=planet_feed)
                    select_matches = post.selectors()
                except PostAlreadyExists:
                    print("Skipping post {} ({}) because it already exists"
                          .format(guid, url))
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    if (select_matches or
                            not PLANET_CONFIG["FILTER_WITH_SELECTORS"]):
                        print("matches!", select_matches, post.title)
                        # keep the feed entry around for the pre_save signal
                        post.entry = entry
                        post.save()
                        print(" -" * 20)
                        print(post)
                        print(" -" * 20)
                        new_posts_count += 1

                        make_entry_tags(entry, post)
                        make_selector_tags(select_matches, post)
                        make_links(entry, post)
                        make_enclosures(entry, post)
                        check_content_images(post)
                        make_author_contributors(entry, post)

                        # send the post_created signal
                        print('post_created.send(sender=post)', post)
                        post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "{}?start-index={}&max-results={}".format(
                    feed_url, len(entries) + 1, items_per_page)
                print("retrieving {}...".format(opensearch_url))
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print("{} posts were created. Done.".format(new_posts_count))

    return new_posts_count
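
# The helpers called above (make_entry_tags, make_selector_tags, make_links,
# make_enclosures, check_content_images, make_author_contributors) are
# defined elsewhere. Judging by the inline logic of the earlier revisions,
# make_enclosures presumably looks roughly like this sketch (an assumption,
# not the project's actual implementation):
def make_enclosures(entry, post):
    for enclosure_dict in entry.get("enclosures", []):
        Enclosure.objects.get_or_create(
            post=post,
            length=enclosure_dict.get("length", 0),
            mime_type=enclosure_dict.get("type", ""),
            link=enclosure_dict.get("href"))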
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        Converts things like "-noise-" to "noise" and "- noise -" to "noise".
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        # fix for HTML entities
        tag = unicode(BeautifulStoneSoup(
            tag, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        print "Please set the variable USER_AGENT = <string> in your settings.py"
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(0)
    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(0)

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        last_modified = document.get("updated_parsed", datetime.now())
        feed_links = document.feed.get("links", [])

        if not blog_url:
            # prefer the "self" link, fall back to "alternate"
            link = filter(lambda item: item["rel"] == "self", feed_links)
            if link:
                blog_url = link[0]["href"]
            else:
                link = filter(lambda item: item["rel"] == "alternate",
                              feed_links)
                if link:
                    blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            # TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site, category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type") or '',
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (
            document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("id")
                content = entry.get("description") or \
                    entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if len(Post.objects.filter(feed=planet_feed, guid=guid)):
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                date_modified=date_modified, feed=planet_feed)
                    # keep the feed entry around for the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because it already exists" \
                        % (guid, url)
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    new_posts_count += 1

                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = normalize_tag(tag_name)
                        if len(tag_name) > 50:
                            continue
                        try:
                            if "/" in tag_name:
                                # for path-based categories
                                for subtag in tag_name.split("/"):
                                    # subtag is empty if the name starts or
                                    # ends with a slash
                                    if subtag:
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError, e:
                            print "Ignoring tag error: %s" % e

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", ""),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    for media_thumbnail in entry.get('media_thumbnail', []):
                        url = media_thumbnail.get('url')
                        mime_type, enc = mimetypes.guess_type(
                            urlparse(url).path)
                        post_enclosure, created = \
                            Enclosure.objects.get_or_create(
                                post=post,
                                length=0,
                                mime_type=mime_type or '',
                                link=url)

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                                 is_contributor=True)
                            pad.save()

                    # send the post_created signal
                    print 'post_created.send(sender=post)', post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" % \
                    (feed_url, len(entries) + 1, items_per_page)
                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
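
# The thumbnail loop above infers a MIME type from the URL path alone.
# A quick illustration of why urlparse(url).path (not the full URL) is
# passed to mimetypes.guess_type -- a query string would otherwise defeat
# the extension lookup (the URL below is illustrative):
import mimetypes
from urlparse import urlparse  # urllib.parse on Python 3

thumb_url = "http://example.com/media/cover.jpg?w=150"
mime_type, encoding = mimetypes.guess_type(urlparse(thumb_url).path)
# mime_type == "image/jpeg"; guessing on the raw URL would see ".jpg?w=150"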
def process_feed(feed_url, create=False):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        Converts things like "-noise-" to "noise" and "- noise -" to "noise".
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        tag = tag.strip()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        print "Please set the variable USER_AGENT = <string> in your settings.py"
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(0)
    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(0)

    # retrieve and parse the feed using a conditional GET
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last-checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        last_modified = document.get("updated_parsed", datetime.now())

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url,
                           language=language, etag=etag,
                           last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in document.feed.get("links", []):
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type"),
                link=link_dict.get("href", blog_url))

    entries = []
    total_results = int(document.feed.get("opensearch_totalresults",
                                          len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))

    if total_results == 0:
        print "*" * 20
        print "Feed: %s" % planet_feed.url
        print "No entries to store. Exiting..."
    else:
        print "Entries total count: %d" % total_results
        print

        new_posts_count = 0
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:
            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("guid")
                content = entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or \
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except:
                    date_modified = None

                try:
                    post = Post(title=title, url=url, guid=guid,
                                content=content, comments_url=comments_url,
                                date_modified=date_modified, feed=planet_feed)
                    post.save()
                except:
                    print "Skipping post %s (%s) because it already exists" \
                        % (guid, url)
                    if not create:
                        # in update mode, stop retrieving as soon as a
                        # repeated post is found
                        stop_retrieving = True
                else:
                    new_posts_count += 1

                    # create post tags...
                    post_tags = []
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = tag_name[:255]
                        tag_name = normalize_tag(tag_name)
                        post_tags.append(tag_name)
                    if post_tags:
                        post.tags = ", ".join(set(post_tags))

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure, created = \
                            Enclosure.objects.get_or_create(
                                post=post,
                                length=enclosure_dict.get("length", 0),
                                mime_type=enclosure_dict.get("type"),
                                link=enclosure_dict.get("href"))

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                                 is_contributor=True)
                            pad.save()

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" % \
                    (feed_url, len(entries) + 1, items_per_page)
                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        print "*" * 20
        print "Feed: %s" % planet_feed.url
        if new_posts_count:
            # update last-modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
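
# The pagination above follows an OpenSearch-style convention: the feed
# advertises opensearch:totalResults and opensearch:itemsPerPage, and the
# client re-requests with start-index/max-results until everything has been
# fetched. A self-contained sketch of the page-URL sequence (function name
# and sample values are illustrative):
def opensearch_pages(feed_url, total_results, items_per_page):
    fetched = 0
    while fetched < total_results:
        yield "%s?start-index=%d&max-results=%d" % (
            feed_url, fetched + 1, items_per_page)
        fetched += items_per_page

# list(opensearch_pages("http://example.com/feed", 60, 25)) ->
#   ['http://example.com/feed?start-index=1&max-results=25',
#    'http://example.com/feed?start-index=26&max-results=25',
#    'http://example.com/feed?start-index=51&max-results=25']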