def __store_article(self, title, url, category, content, date_published,
                    author_name, feed_link, feed):
    # Treat an existing entry with the same title as a duplicate and skip it.
    try:
        FeedsResult.objects.get(title=title)
        return False
    except FeedsResult.DoesNotExist:
        entry = FeedsResult(
            title=htmllib.decoding(title),
            link=url,
            excerpt=content,
            author_name=htmllib.decoding(author_name),
            category=category,
            # Assumes the owning feed row is unique by its link.
            feed=self.model.objects.get(link=feed_link),
            date=date_published,
        )
        entry.save()
        return True
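# date_published arrives from __parse_feed as a datetime, but feeds sometimes
# expose publication dates as raw strings. A minimal fallback parser for the
# two common formats (the helper name _parse_pub_date is hypothetical, and
# this is a sketch, not part of the stored pipeline):

def _parse_pub_date(date_published):
    # RFC 822 style with the timezone suffix stripped,
    # e.g. 'Tue, 01 Jan 2013 12:00:00 +0800' -> 'Tue, 01 Jan 2013 12:00:00'
    try:
        return datetime.strptime(date_published[:-6], '%a, %d %b %Y %H:%M:%S')
    except (ValueError, TypeError):
        pass
    # ISO-ish style, e.g. '2013-01-01 12:00:00'
    try:
        return datetime.strptime(date_published[0:19], '%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError):
        return datetime.now()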
def __parse_feed(self, feed_content, feed_url, stop_target, category,
                 feed_latest, start_target, mid_target, end_target,
                 allow_target):
    feed = feedparser.parse(feed_content)
    i = 0
    for entry in feed.entries:
        title = htmllib.decoding(entry.title)
        content = ''
        date_published = datetime.now()
        author_name = ''
        if not self.__feedslist_check(title):
            try:
                i += 1
                url = ''
                logging.info('begin to add new article No. %s', i)
                # Prefer the original URL when the feed went through FeedBurner.
                if 'feedburner_origlink' in entry:
                    url = entry.feedburner_origlink
                else:
                    url = entry.link
                if 'content' in entry:
                    content = entry.content[0].value
                else:
                    content = entry.description
                if 'author' in entry:
                    author_name = entry.author
                else:
                    author_name = "转载"  # "reposted": the feed names no author
                # Strip any markup left in the title.
                stripper = HTMLStripper()
                stripper.feed(title)
                title = stripper.get_data()
                content = htmllib.decoding(content)
                content = htmllib.GetFeedclean(url, content, stop_target)
                if 'updated_parsed' in entry:
                    date_published = datetime(*entry.updated_parsed[:6])
                else:
                    date_published = datetime.now()
            except Exception as data:
                logging.warning('error while parsing the entry: %s', data)
            try:
                feedresult = self.__store_article(
                    title, url, category, content, date_published,
                    author_name, feed_url, feed)
                if feedresult:
                    logging.info('The No.%s is fetched to the db', i)
                else:
                    logging.error('The No.%s fetch failed', i)
            except Exception as data:
                logging.warning('the error is %s', data)
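# __parse_feed skips entries for which __feedslist_check returns True. That
# method is not shown in this file; a minimal sketch, assuming it simply asks
# the database whether an article with the same title is already stored
# (mirroring the duplicate check in __store_article):

def __feedslist_check(self, title):
    # True when the title already exists in FeedsResult, i.e. a duplicate.
    return FeedsResult.objects.filter(title=title).exists()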
def __store_article(self, contenthtml, feed):
    entry = FeedsResult.objects.get(pk=feed.pk)
    try:
        entry.content = htmllib.decoding(contenthtml)
        entry.fetch_stat = 1  # full content fetched
        # Record every image URL so it can be mirrored and rewritten later.
        images = htmllib.Parse_images_url(contenthtml)
        for image in images:
            obj, created = TempImages.objects.get_or_create(
                oldurl=image, entry=entry)
    except Exception as data:
        entry.fetch_stat = 2  # fetch failed
        logging.error('the db saved error is: %s', data)
    # Persist the content and fetch status either way.
    entry.save()
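# htmllib.Parse_images_url is assumed to return the src attribute of every
# <img> tag in the given HTML. The real helper lives in the project's own
# htmllib module and is not shown here; a rough regex-based sketch of that
# contract:

import re

def parse_images_url(html):
    # Collect the src of each <img> tag, in document order.
    return re.findall(r"<img[^>]+src=['\"]([^'\"]+)['\"]", html, re.I)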
def __Parse_image(self, content):
    # Swap each original image URL for its mirrored copy, when one exists.
    images = htmllib.Parse_images_url(content)
    if images:
        for image in images:
            try:
                tmpimage = TempImages.objects.get(oldurl=image)
            except TempImages.DoesNotExist:
                continue
            except Exception as data:
                logging.info(data)
                continue
            content = gbtools.stringQ2B(content)
            content = htmllib.decoding(content).replace(
                image, tmpimage.newurl)
    return content
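# Example flow (hypothetical caller): __store_article records each original
# image URL in TempImages; once the images are mirrored and newurl is filled
# in, the stored HTML can be rewritten in place:
#
#     entry = FeedsResult.objects.get(pk=some_pk)
#     entry.content = self.__Parse_image(entry.content)
#     entry.save()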