Exemplo n.º 1
0
    def __store_article(self, contenthtml, feed):
        """Copy fetched article HTML onto the FeedsResult row for ``feed``.

        The row is looked up by ``feed.pk``.  Its ``content`` is set to the
        decoded HTML and ``fetch_stat`` to 1, and a TempImages row is ensured
        (``get_or_create``) for every image URL found in the HTML.  If any of
        that raises, ``fetch_stat`` is set to 2 and the error is logged.

        NOTE(review): nothing in this block saves ``entry``; confirm that the
        change is persisted by code outside this excerpt.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            # A falsy result (None or empty) just means there is nothing to
            # record; it must not be reported as a failure.
            for image in htmllib.Parse_images_url(contenthtml) or ():
                TempImages.objects.get_or_create(oldurl=image, entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            logging.info('the db saved error is: %s', data)
Exemplo n.º 2
0
    def __store_article(self, contenthtml, feed):
        """Copy fetched article HTML onto the FeedsResult row for ``feed``.

        The row is looked up by ``feed.pk``.  Its ``content`` is set to the
        decoded HTML and ``fetch_stat`` to 1, and a TempImages row is ensured
        (``get_or_create``) for every image URL found in the HTML.  If any of
        that raises, ``fetch_stat`` is set to 2 and the error is logged.

        NOTE(review): nothing in this block saves ``entry``; confirm that the
        change is persisted by code outside this excerpt.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            # A falsy result (None or empty) just means there is nothing to
            # record; it must not be reported as a failure.
            for image in htmllib.Parse_images_url(contenthtml) or ():
                TempImages.objects.get_or_create(oldurl=image, entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            logging.info('the db saved error is: %s', data)
Exemplo n.º 3
0
    def __Parse_image(self, content):
        """Replace image URLs in ``content`` with their stored ``newurl``.

        Every URL reported by ``htmllib.Parse_images_url`` is looked up in
        TempImages by ``oldurl``.  When a row is found, the content is passed
        through ``gbtools.stringQ2B`` and ``htmllib.decoding`` and the old URL
        is replaced with the row's ``newurl``.

        Returns the content, rewritten where lookups succeeded and otherwise
        unchanged.  Errors are logged, never raised.
        """
        images = htmllib.Parse_images_url(content)
        if not images:
            return content

        for image in images:
            # Handle each image on its own so that one failed lookup does not
            # stop the remaining URLs from being rewritten.
            try:
                tmpimage = TempImages.objects.get(oldurl=image)
                if tmpimage is not None:
                    content = gbtools.stringQ2B(content)
                    content = htmllib.decoding(content).replace(
                        image, tmpimage.newurl)
            except Exception as data:
                logging.info(data)
        return content
Exemplo n.º 4
0
    def __Parse_image(self, content):
        """Replace image URLs in ``content`` with their stored ``newurl``.

        Every URL reported by ``htmllib.Parse_images_url`` is looked up in
        TempImages by ``oldurl``.  When a row is found, the content is passed
        through ``gbtools.stringQ2B`` and ``htmllib.decoding`` and the old URL
        is replaced with the row's ``newurl``.

        Returns the content, rewritten where lookups succeeded and otherwise
        unchanged.  Errors are logged, never raised.
        """
        images = htmllib.Parse_images_url(content)
        if not images:
            return content

        for image in images:
            # Handle each image on its own so that one failed lookup does not
            # stop the remaining URLs from being rewritten.
            try:
                tmpimage = TempImages.objects.get(oldurl=image)
                if tmpimage is not None:
                    content = gbtools.stringQ2B(content)
                    content = htmllib.decoding(content).replace(
                        image, tmpimage.newurl)
            except Exception as data:
                logging.info(data)
        return content
Exemplo n.º 5
0
    def getArticle(self, request, queryset, *arg1, **arg2):
        for feed in queryset:
            logging.info('start to fetch article,The title is %s', feed.title)
            try:
                if feed.feed.start_target != 'nohtml':

                    logging.info('fetch new article %s,at %s' %
                                 (feed.link, datetime.now()))
                    contenthtml = ''
                    try:
                        result = getpage(feed.link, 30)
                        if result.code == 200:
                            if len(
                                    feed.feed.start_target
                            ) != 0 and feed.feed.start_target != 'nohtml':
                                contenthtml = htmllib.parsehtml(
                                    result.read(), feed.feed, feed.link,
                                    feed.feed.feedurl)
                            else:
                                contenthtml = feed.excerpt

                            self.__store_article(contenthtml, feed)

                            return True
                        return False
                    except Exception, data:
                        logging.info('DownloadError in get %s.the error is %s',
                                     feed.link, data)
                        return False
                else:
                    self.__store_article(feed.excerpt, feed)
Exemplo n.º 6
0
    def getFeed(self, request, queryset, *arg1, **arg2):
        """Download and parse every feed in ``queryset`` that is due.

        A feed whose ``last_retrieved`` is newer than 1200 minutes ago is
        skipped.  Otherwise ``feedurl`` is downloaded with ``getpage``; on
        status 200 the body is handed to ``__parse_feed`` and the feed is
        saved with a fresh ``last_retrieved``.  Any other status is only
        logged.  If the download itself raises, the loop stops and the
        remaining feeds are not processed.

        ``request``, ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        logging.info(u'开始采集Feed')
        feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)

        for feed in queryset:
            last_retrieved = feed.last_retrieved
            # Comparing None with a datetime raises, so a feed without a
            # timestamp is treated as due.
            # NOTE(review): confirm whether last_retrieved can be None.
            if (last_retrieved is not None
                    and last_retrieved > feed_retrieval_deadline):
                logging.info('Skipping feed %s.', feed.feedurl)
                continue

            logging.info('Getting feed %s.', feed.feedurl)
            try:
                result = getpage(feed.feedurl, 30)
            except Exception:
                logging.warning(
                    'Could not get feed %s ,and the fetch is restart now',
                    feed.feedurl)
                # Only the in-memory object is touched here; it is not saved
                # before the run is abandoned.
                feed.last_retrieved = datetime.now()
                break

            status = result.code
            if status == 200:
                self.__parse_feed(result.read(), feed.feedurl,
                                  feed.stop_target, feed.category, feed.latest,
                                  feed.start_target, feed.mid_target,
                                  feed.end_target, feed.allow_target)

                feed.last_retrieved = datetime.now()
                feed.save()
            elif status == 500:
                logging.error('Feed %s returned with status code 500.',
                              feed.feedurl)
            elif status == 404:
                logging.error('Error 404: Nothing found at %s.', feed.feedurl)
            else:
                # Previously any other status was dropped without a trace.
                logging.error('Feed %s returned with status code %s.',
                              feed.feedurl, status)
Exemplo n.º 7
0
    def getArticle(self, request, queryset, *arg1, **arg2):
                for feed in queryset:
                        logging.info('start to fetch article,The title is %s', feed.title)
                        try:
                                if feed.feed.start_target != 'nohtml':

                                        logging.info('fetch new article %s,at %s' % (feed.link, datetime.now()))
                                        contenthtml = ''
                                        try:
                                                result = getpage(feed.link, 30)
                                                if result.code == 200:
                                                        if len(feed.feed.start_target) != 0 and feed.feed.start_target != 'nohtml':
                                                                contenthtml = htmllib.parsehtml(result.read(), feed.feed, feed.link, feed.feed.feedurl)
                                                        else:
                                                                contenthtml = feed.excerpt
                        
                                                        self.__store_article(contenthtml, feed)
                        
                                                        return True
                                                return False
                                        except Exception, data:
                                                logging.info('DownloadError in get %s.the error is %s', feed.link, data)
                                                return False
                                else:
                                    self.__store_article(feed.excerpt, feed)
Exemplo n.º 8
0
    def getFeed(self, request, queryset, *arg1, **arg2):
        """Download and parse every feed in ``queryset`` that is due.

        A feed whose ``last_retrieved`` is newer than 1200 minutes ago is
        skipped.  Otherwise ``feedurl`` is downloaded with ``getpage``; on
        status 200 the body is handed to ``__parse_feed`` and the feed is
        saved with a fresh ``last_retrieved``.  Any other status is only
        logged.  If the download itself raises, the loop stops and the
        remaining feeds are not processed.

        ``request``, ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        logging.info(u'开始采集Feed')
        feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)

        for feed in queryset:
            last_retrieved = feed.last_retrieved
            # Comparing None with a datetime raises, so a feed without a
            # timestamp is treated as due.
            # NOTE(review): confirm whether last_retrieved can be None.
            if (last_retrieved is not None
                    and last_retrieved > feed_retrieval_deadline):
                logging.info('Skipping feed %s.', feed.feedurl)
                continue

            logging.info('Getting feed %s.', feed.feedurl)
            try:
                result = getpage(feed.feedurl, 30)
            except Exception:
                logging.warning(
                    'Could not get feed %s ,and the fetch is restart now',
                    feed.feedurl)
                # Only the in-memory object is touched here; it is not saved
                # before the run is abandoned.
                feed.last_retrieved = datetime.now()
                break

            status = result.code
            if status == 200:
                self.__parse_feed(result.read(), feed.feedurl,
                                  feed.stop_target, feed.category, feed.latest,
                                  feed.start_target, feed.mid_target,
                                  feed.end_target, feed.allow_target)

                feed.last_retrieved = datetime.now()
                feed.save()
            elif status == 500:
                logging.error('Feed %s returned with status code 500.',
                              feed.feedurl)
            elif status == 404:
                logging.error('Error 404: Nothing found at %s.', feed.feedurl)
            else:
                # Previously any other status was dropped without a trace.
                logging.error('Feed %s returned with status code %s.',
                              feed.feedurl, status)
Exemplo n.º 9
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category,
                     feed_latest, start_target, mid_target, end_target,
                     allow_target):
        """Parse raw feed data and hand selected entries to ``__store_article``.

        ``feed_content`` is parsed with feedparser.  For every entry whose
        decoded title makes ``__feedslist_check`` return a value equal to
        False, the link, content, author and date are read from the entry,
        the title is run through HTMLStripper, the content is decoded and
        cleaned with ``htmllib.GetFeedclean`` and the result is passed on for
        storage.  Errors in either step are logged, not raised.

        ``feed_latest``, ``start_target``, ``mid_target``, ``end_target`` and
        ``allow_target`` are accepted but not used in this block.
        """
        feed = feedparser.parse(feed_content)
        i = 0
        dead_i = 0
        for entry in feed.entries:
            logging.info('start parse feed,the dead_i is %s', dead_i)
            title = htmllib.decoding(entry.title)
            content = ''
            date_published = datetime.now()
            author_name = ''
            # NOTE(review): Mystat is assigned but never read in this block.
            Mystat = True
            # Explicit comparison kept on purpose: a None result must not be
            # treated the same as False.
            if self.__feedslist_check(title) == False:
                try:
                    i += 1
                    url = ''
                    logging.info('beging to add new article No. %s', i)
                    # ``in`` replaces dict.has_key(), which Python 3 removed.
                    if 'feedburner_origlink' in entry:
                        url = entry.feedburner_origlink
                    else:
                        url = entry.link
                    if 'content' in entry:
                        content = entry.content[0].value
                    else:
                        content = entry.description
                    if 'author' in entry:
                        author_name = entry.author
                    else:
                        author_name = "转载"
                    stripper = HTMLStripper()
                    stripper.feed(title)
                    title = stripper.get_data()
                    content = htmllib.decoding(content)
                    content = htmllib.GetFeedclean(url, content, stop_target)
                    if 'updated_parsed' in entry:
                        date_published = datetime(*entry.updated_parsed[:6])
                    else:
                        date_published = datetime.now()
                except Exception as data:
                    logging.warning(
                        'this like something happened,the error is %s', data)

                # Storage is still attempted after a parse error, using
                # whatever values were collected up to that point.
                try:
                    # NOTE(review): eight arguments are passed here; confirm
                    # the target __store_article accepts this signature.
                    feedresult = self.__store_article(title, url, category,
                                                      content, date_published,
                                                      author_name, feed_url,
                                                      feed)
                    if feedresult == True:
                        logging.info('The No.%s  is fetched to the db', i)
                    else:
                        logging.error('The No.%s is fetched Fail', i)
                        Mystat = False
                except Exception as data:
                    logging.warning('the error is %s', data)
                    Mystat = False
Exemplo n.º 10
0
    def getImages(self, request, queryset, *arg1, **arg2):
        """Download every image in ``queryset`` and pass it to ``__store_images``.

        Each image is fetched from its encoded ``oldurl``.  On status 200 the
        body is handed to ``__store_images`` together with a name built from
        ``htmllib.sid()`` plus ``'.jpg'``; any other status counts as a
        failure.  The outcome is logged.  An exception raised for one image
        is logged and the loop continues with the next one.

        ``request``, ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        for image in queryset:
            logging.info('start to fetch images,The url is %s', image.oldurl)
            try:
                name = htmllib.sid() + '.jpg'
                response = getpage(htmllib.encoding(image.oldurl), 30)
                # Keep the HTTP response and the storage outcome in separate
                # names instead of reusing one variable for both.
                if response.code == 200:
                    stored = self.__store_images(response.read(), name, image)
                else:
                    stored = False

                if stored:
                    logging.info('Success!')
                else:
                    logging.info('this one was Fail!')
            except Exception as data:
                logging.info(data)
Exemplo n.º 11
0
    def getImages(self, request, queryset, *arg1, **arg2):
        """Download every image in ``queryset`` and pass it to ``__store_images``.

        Each image is fetched from its encoded ``oldurl``.  On status 200 the
        body is handed to ``__store_images`` together with a name built from
        ``htmllib.sid()`` plus ``'.jpg'``; any other status counts as a
        failure.  The outcome is logged.  An exception raised for one image
        is logged and the loop continues with the next one.

        ``request``, ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        for image in queryset:
            logging.info('start to fetch images,The url is %s', image.oldurl)
            try:
                name = htmllib.sid() + '.jpg'
                response = getpage(htmllib.encoding(image.oldurl), 30)
                # Keep the HTTP response and the storage outcome in separate
                # names instead of reusing one variable for both.
                if response.code == 200:
                    stored = self.__store_images(response.read(), name, image)
                else:
                    stored = False

                if stored:
                    logging.info('Success!')
                else:
                    logging.info('this one was Fail!')
            except Exception as data:
                logging.info(data)
Exemplo n.º 12
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category,
                     feed_latest, start_target, mid_target, end_target,
                     allow_target):
        """Parse raw feed data and hand selected entries to ``__store_article``.

        ``feed_content`` is parsed with feedparser.  For every entry whose
        decoded title makes ``__feedslist_check`` return a value equal to
        False, the link, content, author and date are read from the entry,
        the title is run through HTMLStripper, the content is decoded and
        cleaned with ``htmllib.GetFeedclean`` and the result is passed on for
        storage.  Errors in either step are logged, not raised.

        ``feed_latest``, ``start_target``, ``mid_target``, ``end_target`` and
        ``allow_target`` are accepted but not used in this block.
        """
        feed = feedparser.parse(feed_content)
        i = 0
        dead_i = 0
        for entry in feed.entries:
            logging.info('start parse feed,the dead_i is %s', dead_i)
            title = htmllib.decoding(entry.title)
            content = ''
            date_published = datetime.now()
            author_name = ''
            # NOTE(review): Mystat is assigned but never read in this block.
            Mystat = True
            # Explicit comparison kept on purpose: a None result must not be
            # treated the same as False.
            if self.__feedslist_check(title) == False:
                try:
                    i += 1
                    url = ''
                    logging.info('beging to add new article No. %s', i)
                    # ``in`` replaces dict.has_key(), which Python 3 removed.
                    if 'feedburner_origlink' in entry:
                        url = entry.feedburner_origlink
                    else:
                        url = entry.link
                    if 'content' in entry:
                        content = entry.content[0].value
                    else:
                        content = entry.description
                    if 'author' in entry:
                        author_name = entry.author
                    else:
                        author_name = "转载"
                    stripper = HTMLStripper()
                    stripper.feed(title)
                    title = stripper.get_data()
                    content = htmllib.decoding(content)
                    content = htmllib.GetFeedclean(url, content, stop_target)
                    if 'updated_parsed' in entry:
                        date_published = datetime(*entry.updated_parsed[:6])
                    else:
                        date_published = datetime.now()
                except Exception as data:
                    logging.warning(
                        'this like something happened,the error is %s', data)

                # Storage is still attempted after a parse error, using
                # whatever values were collected up to that point.
                try:
                    # NOTE(review): eight arguments are passed here; confirm
                    # the target __store_article accepts this signature.
                    feedresult = self.__store_article(title, url, category,
                                                      content, date_published,
                                                      author_name, feed_url,
                                                      feed)
                    if feedresult == True:
                        logging.info('The No.%s  is fetched to the db', i)
                    else:
                        logging.error('The No.%s is fetched Fail', i)
                        Mystat = False
                except Exception as data:
                    logging.warning('the error is %s', data)
                    Mystat = False
Exemplo n.º 13
0
    def __store_article(self, contenthtml, feed):
        """Save fetched article HTML onto the FeedsResult row for ``feed``.

        The row is looked up by ``feed.pk``.  Its ``content`` is set to the
        decoded HTML and ``fetch_stat`` to 1, and a TempImages row is ensured
        (``get_or_create``) for every image URL found in the HTML.  If any of
        that raises, ``fetch_stat`` is set to 2 and the error is logged.  The
        row is saved in both cases.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            # A falsy result (None or empty) just means there is nothing to
            # record; it must not be reported as a failure.
            for image in htmllib.Parse_images_url(contenthtml) or ():
                TempImages.objects.get_or_create(oldurl=image, entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            logging.info('the db saved error is: %s', data)
        entry.save()
        logging.info('adding the article,the name is %s', feed.title)

    def saveArticle(self, request, queryset, *arg1, **arg2):
        """Pass every item in ``queryset`` to ``__store_entry``.

        The value returned by ``__store_entry`` is ignored.  ``request``,
        ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        for entry in queryset:
            self.__store_entry(entry)

    # Label attached to saveArticle (presumably shown as the action name in
    # the Django admin -- TODO confirm).
    saveArticle.short_description = u'发布采集'

    def __store_entry(self, feed):
        try:
            entry, result = Entry.published.get_or_create(title=feed.title)
            entry.excerpt = feed.excerpt
            entry.status = 2
            entry.author_name = feed.author_name
            entry.date = feed.date
            entry.slug = htmllib.sid()
Exemplo n.º 14
0
    # Label attached to getArticle (presumably shown as the action name in
    # the Django admin -- TODO confirm).
    getArticle.short_description = u'采集正文内容'

    def __store_article(self, contenthtml, feed):
        """Save fetched article HTML onto the FeedsResult row for ``feed``.

        The row is looked up by ``feed.pk``.  Its ``content`` is set to the
        decoded HTML and ``fetch_stat`` to 1, and a TempImages row is ensured
        (``get_or_create``) for every image URL found in the HTML.  If any of
        that raises, ``fetch_stat`` is set to 2 and the error is logged.  The
        row is saved in both cases.
        """
        entry = FeedsResult.objects.get(pk=feed.pk)
        try:
            entry.content = htmllib.decoding(contenthtml)
            entry.fetch_stat = 1
            # A falsy result (None or empty) just means there is nothing to
            # record; it must not be reported as a failure.
            for image in htmllib.Parse_images_url(contenthtml) or ():
                TempImages.objects.get_or_create(oldurl=image, entry=entry)
        except Exception as data:
            entry.fetch_stat = 2
            logging.info('the db saved error is: %s', data)
        entry.save()
        logging.info('adding the article,the name is %s', feed.title)

    def saveArticle(self, request, queryset, *arg1, **arg2):
        """Pass every item in ``queryset`` to ``__store_entry``.

        The value returned by ``__store_entry`` is ignored.  ``request``,
        ``*arg1`` and ``**arg2`` are accepted but not used.
        """
        for entry in queryset:
            self.__store_entry(entry)
            
    # Label attached to saveArticle (presumably shown as the action name in
    # the Django admin -- TODO confirm).
    saveArticle.short_description = u'发布采集'
    def __store_entry(self, feed):
                try:
                    entry, result = Entry.published.get_or_create(title=feed.title)
                    entry.excerpt = feed.excerpt
                    entry.status = 2
                    entry.author_name = feed.author_name
                    entry.date = feed.date
                    entry.slug = htmllib.sid() 
                    entry.content = self.__Parse_image(feed.content)