示例#1
0
    def __store_article(self, title, url, category, content, date_published,
                        author_name, feed_link, feed):
        """Persist one parsed feed entry as a new FeedsResult row.

        Returns False when an article with the same title already exists,
        True after a new row has been saved.

        Note: date_published and feed_link are accepted for interface
        compatibility but are currently unused; the row is stamped with
        datetime.now().
        """
        try:
            # De-duplicate on title: an existing row means nothing to do.
            FeedsResult.objects.get(title=title)
            return False
        except FeedsResult.DoesNotExist:
            # Only a genuine "not found" triggers creation.  The original
            # bare `except:` also swallowed real DB errors (connection
            # loss, MultipleObjectsReturned) and then inserted duplicates.
            entry = FeedsResult(
                title=htmllib.decoding(title),
                link=url,
                excerpt=content,
                author_name=htmllib.decoding(author_name),
                category=category,
                # NOTE(review): .get() without filters requires exactly one
                # model row; the `feed` argument (a feedparser result) is
                # never used here -- confirm intent with the caller.
                feed=self.model.objects.get(),
                date=datetime.now(),
            )
            entry.save()
            return True
示例#2
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category,
                     feed_latest, start_target, mid_target, end_target,
                     allow_target):
        """Parse raw feed XML and store every not-yet-seen entry.

        feed_content -- raw feed document handed to feedparser
        feed_url     -- source URL, forwarded to __store_article
        stop_target  -- cleanup marker used by htmllib.GetFeedclean
        The remaining feed_latest / *_target arguments are accepted for
        interface compatibility but are unused in this method.
        """
        feed = feedparser.parse(feed_content)
        i = 0
        dead_i = 0
        for entry in feed.entries:
            logging.info('start parse feed,the dead_i is %s', dead_i)
            title = htmllib.decoding(entry.title)
            categorie_keys = []
            content = ''
            date_published = datetime.now()
            author_name = ''
            Mystat = True
            # Skip titles that are already known to the feeds list.
            if self.__feedslist_check(title) == False:
                try:
                    i += 1
                    url = ''
                    logging.info('beging to add new article No. %s', i)
                    # Prefer the original article URL for FeedBurner feeds.
                    # `has_key` is deprecated (removed in Python 3); use `in`.
                    if 'feedburner_origlink' in entry:
                        url = entry.feedburner_origlink
                    else:
                        url = entry.link
                    if 'content' in entry:
                        content = entry.content[0].value
                    else:
                        content = entry.description
                    if 'author' in entry:
                        author_name = entry.author
                    else:
                        author_name = "转载"
                    # Strip HTML markup from the title.
                    stripper = HTMLStripper()
                    stripper.feed(title)
                    title = stripper.get_data()
                    content = htmllib.decoding(content)
                    content = htmllib.GetFeedclean(url, content, stop_target)
                    if 'updated_parsed' in entry:
                        date_published = datetime(*entry.updated_parsed[:6])
                    else:
                        date_published = datetime.now()
                except Exception as data:
                    logging.warn(
                        'this like something happened,the error is %s', data)

                try:
                    feedresult = self.__store_article(title, url, category,
                                                      content, date_published,
                                                      author_name, feed_url,
                                                      feed)
                    if feedresult == True:
                        logging.info('The No.%s  is fetched to the db', i)
                    else:
                        logging.error('The No.%s is fetched Fail', i)
                        Mystat = False
                except Exception as data:
                    logging.warning('the error is %s', data)
                    Mystat = False
示例#3
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category,
                     feed_latest, start_target, mid_target, end_target,
                     allow_target):
        """Parse raw feed XML and store every not-yet-seen entry.

        feed_content -- raw feed document handed to feedparser
        feed_url     -- source URL, forwarded to __store_article
        stop_target  -- cleanup marker used by htmllib.GetFeedclean
        The remaining feed_latest / *_target arguments are accepted for
        interface compatibility but are unused in this method.
        """
        feed = feedparser.parse(feed_content)
        i = 0
        dead_i = 0
        for entry in feed.entries:
            logging.info('start parse feed,the dead_i is %s', dead_i)
            title = htmllib.decoding(entry.title)
            categorie_keys = []
            content = ''
            date_published = datetime.now()
            author_name = ''
            Mystat = True
            # Skip titles that are already known to the feeds list.
            if self.__feedslist_check(title) == False:
                try:
                    i += 1
                    url = ''
                    logging.info('beging to add new article No. %s', i)
                    # Prefer the original article URL for FeedBurner feeds.
                    # `has_key` is deprecated (removed in Python 3); use `in`.
                    if 'feedburner_origlink' in entry:
                        url = entry.feedburner_origlink
                    else:
                        url = entry.link
                    if 'content' in entry:
                        content = entry.content[0].value
                    else:
                        content = entry.description
                    if 'author' in entry:
                        author_name = entry.author
                    else:
                        author_name = "转载"
                    # Strip HTML markup from the title.
                    stripper = HTMLStripper()
                    stripper.feed(title)
                    title = stripper.get_data()
                    content = htmllib.decoding(content)
                    content = htmllib.GetFeedclean(url, content, stop_target)
                    if 'updated_parsed' in entry:
                        date_published = datetime(*entry.updated_parsed[:6])
                    else:
                        date_published = datetime.now()
                except Exception as data:
                    logging.warn(
                        'this like something happened,the error is %s', data)

                try:
                    feedresult = self.__store_article(title, url, category,
                                                      content, date_published,
                                                      author_name, feed_url,
                                                      feed)
                    if feedresult == True:
                        logging.info('The No.%s  is fetched to the db', i)
                    else:
                        logging.error('The No.%s is fetched Fail', i)
                        Mystat = False
                except Exception as data:
                    logging.warning('the error is %s', data)
                    Mystat = False
示例#4
0
    def __store_article(self, contenthtml, feed):
        """Attach fetched full-text HTML to an existing FeedsResult row.

        Sets fetch_stat to 1 on success and 2 on failure, and queues every
        image URL found in the HTML into TempImages for later mirroring.
        Raises FeedsResult.DoesNotExist when no row matches feed.pk.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            images = htmllib.Parse_images_url(contenthtml)
            for image in images:
                obj, result = TempImages.objects.get_or_create(oldurl=image,
                                                               entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            # exception() records the traceback, not just the message.
            logging.exception('the db saved error is: %s', data)
        # BUG FIX: the modified row was never persisted -- without this
        # save() both the content and the fetch_stat update were lost.
        entry.save()
示例#5
0
    def __store_article(self, contenthtml, feed):
        """Attach fetched full-text HTML to an existing FeedsResult row.

        Sets fetch_stat to 1 on success and 2 on failure, and queues every
        image URL found in the HTML into TempImages for later mirroring.
        Raises FeedsResult.DoesNotExist when no row matches feed.pk.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            images = htmllib.Parse_images_url(contenthtml)
            for image in images:
                obj, result = TempImages.objects.get_or_create(oldurl=image,
                                                               entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            # exception() records the traceback, not just the message.
            logging.exception('the db saved error is: %s', data)
        # BUG FIX: the modified row was never persisted -- without this
        # save() both the content and the fetch_stat update were lost.
        entry.save()
示例#6
0
    def __Parse_image(self, content):
        """Rewrite image URLs in *content* to their mirrored copies.

        Looks up every image URL reported by htmllib.Parse_images_url in
        TempImages and substitutes the stored replacement URL.
        Returns the rewritten content (the original version only mutated a
        local variable and returned None, so all work was discarded).
        """
        images = htmllib.Parse_images_url(content)

        if images:
            for image in images:
                try:
                    tmpimage = TempImages.objects.get(oldurl=image)
                    if tmpimage is not None:
                        content = gbtools.stringQ2B(content)
                        content = htmllib.decoding(content).replace(
                            image, tmpimage.newurl)
                except Exception as data:
                    # Per-image handling: one missing mapping no longer
                    # aborts the remaining images (the original try
                    # wrapped the whole loop).
                    logging.info(data)
        # BUG FIX: hand the rewritten HTML back to the caller.
        return content
示例#7
0
    def __Parse_image(self, content):
        """Rewrite image URLs in *content* to their mirrored copies.

        Looks up every image URL reported by htmllib.Parse_images_url in
        TempImages and substitutes the stored replacement URL.
        Returns the rewritten content (the original version only mutated a
        local variable and returned None, so all work was discarded).
        """
        images = htmllib.Parse_images_url(content)

        if images:
            for image in images:
                try:
                    tmpimage = TempImages.objects.get(oldurl=image)
                    if tmpimage is not None:
                        content = gbtools.stringQ2B(content)
                        content = htmllib.decoding(content).replace(
                            image, tmpimage.newurl)
                except Exception as data:
                    # Per-image handling: one missing mapping no longer
                    # aborts the remaining images (the original try
                    # wrapped the whole loop).
                    logging.info(data)
        # BUG FIX: hand the rewritten HTML back to the caller.
        return content
示例#8
0
    def __store_article(self, title, url, category, content, date_published,
                        author_name, feed_link, feed):
        """Persist one parsed feed entry as a new FeedsResult row.

        Returns False when an article with the same title already exists,
        True after a new row has been saved.

        Note: date_published and feed_link are accepted for interface
        compatibility but are currently unused; the row is stamped with
        datetime.now().
        """
        try:
            # De-duplicate on title: an existing row means nothing to do.
            FeedsResult.objects.get(title=title)
            return False
        except FeedsResult.DoesNotExist:
            # Only a genuine "not found" triggers creation.  The original
            # bare `except:` also swallowed real DB errors (connection
            # loss, MultipleObjectsReturned) and then inserted duplicates.
            entry = FeedsResult(title=htmllib.decoding(title),
                                link=url,
                                excerpt=content,
                                author_name=htmllib.decoding(author_name),
                                category=category,
                                # NOTE(review): .get() without filters needs
                                # exactly one model row; the `feed` argument
                                # (a feedparser result) is never used here
                                # -- confirm intent with the caller.
                                feed=self.model.objects.get(),
                                date=datetime.now())
            entry.save()
            return True